diff options
author | Steven Whitehouse <steve@men-an-tol.chygwyn.com> | 2006-02-23 09:49:43 (GMT) |
---|---|---|
committer | Steven Whitehouse <swhiteho@redhat.com> | 2006-02-23 09:49:43 (GMT) |
commit | d35462b4bb847b68321c55e95c926aa485aecce2 (patch) | |
tree | b08e18bf6e672633402871ee763102fdb5e63229 /kernel | |
parent | 91ffd7db71e7451f89941a8f428b4daa2a7c1e38 (diff) | |
parent | 9e956c2dac9bec602ed1ba29181b45ba6d2b6448 (diff) | |
download | linux-d35462b4bb847b68321c55e95c926aa485aecce2.tar.xz |
Merge branch 'master'
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/auditsc.c | 6 | ||||
-rw-r--r-- | kernel/compat.c | 1 | ||||
-rw-r--r-- | kernel/cpuset.c | 37 | ||||
-rw-r--r-- | kernel/exit.c | 3 | ||||
-rw-r--r-- | kernel/fork.c | 405 | ||||
-rw-r--r-- | kernel/hrtimer.c | 83 | ||||
-rw-r--r-- | kernel/intermodule.c | 3 | ||||
-rw-r--r-- | kernel/itimer.c | 11 | ||||
-rw-r--r-- | kernel/kprobes.c | 36 | ||||
-rw-r--r-- | kernel/module.c | 6 | ||||
-rw-r--r-- | kernel/panic.c | 1 | ||||
-rw-r--r-- | kernel/posix-timers.c | 53 | ||||
-rw-r--r-- | kernel/power/console.c | 16 | ||||
-rw-r--r-- | kernel/power/disk.c | 15 | ||||
-rw-r--r-- | kernel/power/main.c | 4 | ||||
-rw-r--r-- | kernel/power/power.h | 15 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 4 | ||||
-rw-r--r-- | kernel/power/swsusp.c | 18 | ||||
-rw-r--r-- | kernel/ptrace.c | 28 | ||||
-rw-r--r-- | kernel/rcutorture.c | 10 | ||||
-rw-r--r-- | kernel/sched.c | 176 | ||||
-rw-r--r-- | kernel/signal.c | 11 | ||||
-rw-r--r-- | kernel/sys.c | 27 | ||||
-rw-r--r-- | kernel/sys_ni.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 32 | ||||
-rw-r--r-- | kernel/time.c | 15 | ||||
-rw-r--r-- | kernel/timer.c | 41 | ||||
-rw-r--r-- | kernel/user.c | 32 |
28 files changed, 682 insertions, 409 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 685c251..d7e7e63 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -841,7 +841,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) for (aux = context->aux; aux; aux = aux->next) { - ab = audit_log_start(context, GFP_KERNEL, aux->type); + ab = audit_log_start(context, gfp_mask, aux->type); if (!ab) continue; /* audit_panic has been called */ @@ -878,14 +878,14 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) } if (context->pwd && context->pwdmnt) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); + ab = audit_log_start(context, gfp_mask, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); audit_log_end(ab); } } for (i = 0; i < context->name_count; i++) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + ab = audit_log_start(context, gfp_mask, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ diff --git a/kernel/compat.c b/kernel/compat.c index 1867290..8c9cd88b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -23,7 +23,6 @@ #include <linux/security.h> #include <asm/uaccess.h> -#include <asm/bug.h> int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fe2f71f..12815d3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -641,7 +641,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. */ -void cpuset_update_task_memory_state() +void cpuset_update_task_memory_state(void) { int my_cpusets_mem_gen; struct task_struct *tsk = current; @@ -1977,6 +1977,39 @@ void cpuset_fork(struct task_struct *child) * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't * mess with it, or task is a failed fork, never visible to attach_task. + * + * Hack: + * + * Set the exiting tasks cpuset to the root cpuset (top_cpuset). + * + * Don't leave a task unable to allocate memory, as that is an + * accident waiting to happen should someone add a callout in + * do_exit() after the cpuset_exit() call that might allocate. + * If a task tries to allocate memory with an invalid cpuset, + * it will oops in cpuset_update_task_memory_state(). + * + * We call cpuset_exit() while the task is still competent to + * handle notify_on_release(), then leave the task attached to + * the root cpuset (top_cpuset) for the remainder of its exit. + * + * To do this properly, we would increment the reference count on + * top_cpuset, and near the very end of the kernel/exit.c do_exit() + * code we would add a second cpuset function call, to drop that + * reference. This would just create an unnecessary hot spot on + * the top_cpuset reference count, to no avail. + * + * Normally, holding a reference to a cpuset without bumping its + * count is unsafe. The cpuset could go away, or someone could + * attach us to a different cpuset, decrementing the count on + * the first cpuset that we never incremented. But in this case, + * top_cpuset isn't going away, and either task has PF_EXITING set, + * which wards off any attach_task() attempts, or task is a failed + * fork, never visible to attach_task. + * + * Another way to do this would be to set the cpuset pointer + * to NULL here, and check in cpuset_update_task_memory_state() + * for a NULL pointer. This hack avoids that NULL check, for no + * cost (other than this way too long comment ;). **/ void cpuset_exit(struct task_struct *tsk) @@ -1984,7 +2017,7 @@ void cpuset_exit(struct task_struct *tsk) struct cpuset *cs; cs = tsk->cpuset; - tsk->cpuset = NULL; + tsk->cpuset = &top_cpuset; /* Hack - see comment above */ if (notify_on_release(cs)) { char *pathbuf = NULL; diff --git a/kernel/exit.c b/kernel/exit.c index 93cee36..531aadc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -360,6 +360,9 @@ void daemonize(const char *name, ...) fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); + exit_namespace(current); + current->namespace = init_task.namespace; + get_namespace(current->namespace); exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); diff --git a/kernel/fork.c b/kernel/fork.c index 4ae8cfc..fbea12d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -446,6 +446,55 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) } } +/* + * Allocate a new mm structure and copy contents from the + * mm structure of the passed in task structure. + */ +static struct mm_struct *dup_mm(struct task_struct *tsk) +{ + struct mm_struct *mm, *oldmm = current->mm; + int err; + + if (!oldmm) + return NULL; + + mm = allocate_mm(); + if (!mm) + goto fail_nomem; + + memcpy(mm, oldmm, sizeof(*mm)); + + if (!mm_init(mm)) + goto fail_nomem; + + if (init_new_context(tsk, mm)) + goto fail_nocontext; + + err = dup_mmap(mm, oldmm); + if (err) + goto free_pt; + + mm->hiwater_rss = get_mm_rss(mm); + mm->hiwater_vm = mm->total_vm; + + return mm; + +free_pt: + mmput(mm); + +fail_nomem: + return NULL; + +fail_nocontext: + /* + * If init_new_context() failed, we cannot use mmput() to free the mm + * because it calls destroy_context() + */ + mm_free_pgd(mm); + free_mm(mm); + return NULL; +} + static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) { struct mm_struct * mm, *oldmm; @@ -473,43 +522,17 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) } retval = -ENOMEM; - mm = allocate_mm(); + mm = dup_mm(tsk); if (!mm) goto fail_nomem; - /* Copy the current MM stuff.. */ - memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm)) - goto fail_nomem; - - if (init_new_context(tsk,mm)) - goto fail_nocontext; - - retval = dup_mmap(mm, oldmm); - if (retval) - goto free_pt; - - mm->hiwater_rss = get_mm_rss(mm); - mm->hiwater_vm = mm->total_vm; - good_mm: tsk->mm = mm; tsk->active_mm = mm; return 0; -free_pt: - mmput(mm); fail_nomem: return retval; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return retval; } static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) @@ -597,32 +620,17 @@ out: return newf; } -static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +/* + * Allocate a new files structure and copy contents from the + * passed in files structure. + */ +static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { - struct files_struct *oldf, *newf; + struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i, error = 0, expand; + int open_files, size, i, expand; struct fdtable *old_fdt, *new_fdt; - /* - * A background process may not have any files ... - */ - oldf = current->files; - if (!oldf) - goto out; - - if (clone_flags & CLONE_FILES) { - atomic_inc(&oldf->count); - goto out; - } - - /* - * Note: we may be using current for both targets (See exec.c) - * This works because we cache current->files (old) as oldf. Don't - * break this. - */ - tsk->files = NULL; - error = -ENOMEM; newf = alloc_files(); if (!newf) goto out; @@ -651,9 +659,9 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) if (expand) { spin_unlock(&oldf->file_lock); spin_lock(&newf->file_lock); - error = expand_files(newf, open_files-1); + *errorp = expand_files(newf, open_files-1); spin_unlock(&newf->file_lock); - if (error < 0) + if (*errorp < 0) goto out_release; new_fdt = files_fdtable(newf); /* @@ -702,10 +710,8 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } - tsk->files = newf; - error = 0; out: - return error; + return newf; out_release: free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); @@ -715,6 +721,40 @@ out_release: goto out; } +static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +{ + struct files_struct *oldf, *newf; + int error = 0; + + /* + * A background process may not have any files ... + */ + oldf = current->files; + if (!oldf) + goto out; + + if (clone_flags & CLONE_FILES) { + atomic_inc(&oldf->count); + goto out; + } + + /* + * Note: we may be using current for both targets (See exec.c) + * This works because we cache current->files (old) as oldf. Don't + * break this. + */ + tsk->files = NULL; + error = -ENOMEM; + newf = dup_fd(oldf, &error); + if (!newf) + goto out; + + tsk->files = newf; + error = 0; +out: + return error; +} + /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to @@ -802,7 +842,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC); + hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; sig->real_timer.data = tsk; @@ -1083,8 +1123,8 @@ static task_t *copy_process(unsigned long clone_flags, p->real_parent = current; p->parent = p->real_parent; + spin_lock(¤t->sighand->siglock); if (clone_flags & CLONE_THREAD) { - spin_lock(¤t->sighand->siglock); /* * Important: if an exit-all has been started then * do not create this new thread - the whole thread @@ -1122,8 +1162,6 @@ static task_t *copy_process(unsigned long clone_flags, */ p->it_prof_expires = jiffies_to_cputime(1); } - - spin_unlock(¤t->sighand->siglock); } /* @@ -1135,8 +1173,6 @@ static task_t *copy_process(unsigned long clone_flags, if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); - attach_pid(p, PIDTYPE_PID, p->pid); - attach_pid(p, PIDTYPE_TGID, p->tgid); if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); @@ -1146,9 +1182,12 @@ static task_t *copy_process(unsigned long clone_flags, if (p->pid) __get_cpu_var(process_counts)++; } + attach_pid(p, PIDTYPE_TGID, p->tgid); + attach_pid(p, PIDTYPE_PID, p->pid); nr_threads++; total_forks++; + spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); return p; @@ -1323,3 +1362,249 @@ void __init proc_caches_init(void) sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); } + + +/* + * Check constraints on flags passed to the unshare system call and + * force unsharing of additional process context as appropriate. + */ +static inline void check_unshare_flags(unsigned long *flags_ptr) +{ + /* + * If unsharing a thread from a thread group, must also + * unshare vm. + */ + if (*flags_ptr & CLONE_THREAD) + *flags_ptr |= CLONE_VM; + + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (*flags_ptr & CLONE_VM) + *flags_ptr |= CLONE_SIGHAND; + + /* + * If unsharing signal handlers and the task was created + * using CLONE_THREAD, then must unshare the thread + */ + if ((*flags_ptr & CLONE_SIGHAND) && + (atomic_read(¤t->signal->count) > 1)) + *flags_ptr |= CLONE_THREAD; + + /* + * If unsharing namespace, must also unshare filesystem information. + */ + if (*flags_ptr & CLONE_NEWNS) + *flags_ptr |= CLONE_FS; +} + +/* + * Unsharing of tasks created with CLONE_THREAD is not supported yet + */ +static int unshare_thread(unsigned long unshare_flags) +{ + if (unshare_flags & CLONE_THREAD) + return -EINVAL; + + return 0; +} + +/* + * Unshare the filesystem structure if it is being shared + */ +static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) +{ + struct fs_struct *fs = current->fs; + + if ((unshare_flags & CLONE_FS) && + (fs && atomic_read(&fs->count) > 1)) { + *new_fsp = __copy_fs_struct(current->fs); + if (!*new_fsp) + return -ENOMEM; + } + + return 0; +} + +/* + * Unshare the namespace structure if it is being shared + */ +static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) +{ + struct namespace *ns = current->namespace; + + if ((unshare_flags & CLONE_NEWNS) && + (ns && atomic_read(&ns->count) > 1)) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); + if (!*new_nsp) + return -ENOMEM; + } + + return 0; +} + +/* + * Unsharing of sighand for tasks created with CLONE_SIGHAND is not + * supported yet + */ +static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) +{ + struct sighand_struct *sigh = current->sighand; + + if ((unshare_flags & CLONE_SIGHAND) && + (sigh && atomic_read(&sigh->count) > 1)) + return -EINVAL; + else + return 0; +} + +/* + * Unshare vm if it is being shared + */ +static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) +{ + struct mm_struct *mm = current->mm; + + if ((unshare_flags & CLONE_VM) && + (mm && atomic_read(&mm->mm_users) > 1)) { + *new_mmp = dup_mm(current); + if (!*new_mmp) + return -ENOMEM; + } + + return 0; +} + +/* + * Unshare file descriptor table if it is being shared + */ +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +{ + struct files_struct *fd = current->files; + int error = 0; + + if ((unshare_flags & CLONE_FILES) && + (fd && atomic_read(&fd->count) > 1)) { + *new_fdp = dup_fd(fd, &error); + if (!*new_fdp) + return error; + } + + return 0; +} + +/* + * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not + * supported yet + */ +static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp) +{ + if (unshare_flags & CLONE_SYSVSEM) + return -EINVAL; + + return 0; +} + +/* + * unshare allows a process to 'unshare' part of the process + * context which was originally shared using clone. copy_* + * functions used by do_fork() cannot be used here directly + * because they modify an inactive task_struct that is being + * constructed. Here we are modifying the current, active, + * task_struct. + */ +asmlinkage long sys_unshare(unsigned long unshare_flags) +{ + int err = 0; + struct fs_struct *fs, *new_fs = NULL; + struct namespace *ns, *new_ns = NULL; + struct sighand_struct *sigh, *new_sigh = NULL; + struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; + struct files_struct *fd, *new_fd = NULL; + struct sem_undo_list *new_ulist = NULL; + + check_unshare_flags(&unshare_flags); + + if ((err = unshare_thread(unshare_flags))) + goto bad_unshare_out; + if ((err = unshare_fs(unshare_flags, &new_fs))) + goto bad_unshare_cleanup_thread; + if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) + goto bad_unshare_cleanup_fs; + if ((err = unshare_sighand(unshare_flags, &new_sigh))) + goto bad_unshare_cleanup_ns; + if ((err = unshare_vm(unshare_flags, &new_mm))) + goto bad_unshare_cleanup_sigh; + if ((err = unshare_fd(unshare_flags, &new_fd))) + goto bad_unshare_cleanup_vm; + if ((err = unshare_semundo(unshare_flags, &new_ulist))) + goto bad_unshare_cleanup_fd; + + if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + + task_lock(current); + + if (new_fs) { + fs = current->fs; + current->fs = new_fs; + new_fs = fs; + } + + if (new_ns) { + ns = current->namespace; + current->namespace = new_ns; + new_ns = ns; + } + + if (new_sigh) { + sigh = current->sighand; + current->sighand = new_sigh; + new_sigh = sigh; + } + + if (new_mm) { + mm = current->mm; + active_mm = current->active_mm; + current->mm = new_mm; + current->active_mm = new_mm; + activate_mm(active_mm, new_mm); + new_mm = mm; + } + + if (new_fd) { + fd = current->files; + current->files = new_fd; + new_fd = fd; + } + + task_unlock(current); + } + +bad_unshare_cleanup_fd: + if (new_fd) + put_files_struct(new_fd); + +bad_unshare_cleanup_vm: + if (new_mm) + mmput(new_mm); + +bad_unshare_cleanup_sigh: + if (new_sigh) + if (atomic_dec_and_test(&new_sigh->count)) + kmem_cache_free(sighand_cachep, new_sigh); + +bad_unshare_cleanup_ns: + if (new_ns) + put_namespace(new_ns); + +bad_unshare_cleanup_fs: + if (new_fs) + put_fs_struct(new_fs); + +bad_unshare_cleanup_thread: +bad_unshare_out: + return err; +} diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f1c4155..5ae51f1 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -21,6 +21,12 @@ * Credits: * based on kernel/timer.c * + * Help, testing, suggestions, bugfixes, improvements were + * provided by: + * + * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel + * et. al. + * * For licencing details see kernel-base/COPYING */ @@ -66,6 +72,12 @@ EXPORT_SYMBOL_GPL(ktime_get_real); /* * The timer bases: + * + * Note: If we want to add new timer bases, we have to skip the two + * clock ids captured by the cpu-timers. We do this by holding empty + * entries rather than doing math adjustment of the clock ids. + * This ensures that we capture erroneous accesses to these clock ids + * rather than moving them into the range of valid clock id's. */ #define MAX_HRTIMER_BASES 2 @@ -406,8 +418,19 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base); - if (mode == HRTIMER_REL) + if (mode == HRTIMER_REL) { tim = ktime_add(tim, new_base->get_time()); + /* + * CONFIG_TIME_LOW_RES is a temporary way for architectures + * to signal that they simply return xtime in + * do_gettimeoffset(). In this case we want to round up by + * resolution when starting a relative timer, to avoid short + * timeouts. This will go away with the GTOD framework. + */ +#ifdef CONFIG_TIME_LOW_RES + tim = ktime_add(tim, base->resolution); +#endif + } timer->expires = tim; enqueue_hrtimer(timer, new_base); @@ -483,29 +506,25 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } /** - * hrtimer_rebase - rebase an initialized hrtimer to a different base + * hrtimer_init - initialize a timer to the given clock * - * @timer: the timer to be rebased + * @timer: the timer to be initialized * @clock_id: the clock to be used + * @mode: timer mode abs/rel */ -void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id) +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) { struct hrtimer_base *bases; + memset(timer, 0, sizeof(struct hrtimer)); + bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); - timer->base = &bases[clock_id]; -} -/** - * hrtimer_init - initialize a timer to the given clock - * - * @timer: the timer to be initialized - * @clock_id: the clock to be used - */ -void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id) -{ - memset(timer, 0, sizeof(struct hrtimer)); - hrtimer_rebase(timer, clock_id); + if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) + clock_id = CLOCK_MONOTONIC; + + timer->base = &bases[clock_id]; } /** @@ -550,6 +569,7 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) fn = timer->function; data = timer->data; set_curr_timer(base, timer); + timer->state = HRTIMER_RUNNING; __remove_hrtimer(timer, base); spin_unlock_irq(&base->lock); @@ -565,6 +585,10 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) spin_lock_irq(&base->lock); + /* Another CPU has added back the timer */ + if (timer->state != HRTIMER_RUNNING) + continue; + if (restart == HRTIMER_RESTART) enqueue_hrtimer(timer, base); else @@ -638,8 +662,7 @@ schedule_hrtimer_interruptible(struct hrtimer *timer, return schedule_hrtimer(timer, mode); } -static long __sched -nanosleep_restart(struct restart_block *restart, clockid_t clockid) +static long __sched nanosleep_restart(struct restart_block *restart) { struct timespec __user *rmtp; struct timespec tu; @@ -649,7 +672,7 @@ nanosleep_restart(struct restart_block *restart, clockid_t clockid) restart->fn = do_no_restart_syscall; - hrtimer_init(&timer, clockid); + hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; @@ -669,16 +692,6 @@ nanosleep_restart(struct restart_block *restart, clockid_t clockid) return -ERESTART_RESTARTBLOCK; } -static long __sched nanosleep_restart_mono(struct restart_block *restart) -{ - return nanosleep_restart(restart, CLOCK_MONOTONIC); -} - -static long __sched nanosleep_restart_real(struct restart_block *restart) -{ - return nanosleep_restart(restart, CLOCK_REALTIME); -} - long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { @@ -687,7 +700,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct timespec tu; ktime_t rem; - hrtimer_init(&timer, clockid); + hrtimer_init(&timer, clockid, mode); timer.expires = timespec_to_ktime(*rqtp); @@ -695,7 +708,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, if (rem.tv64 <= 0) return 0; - /* Absolute timers do not update the rmtp value: */ + /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_ABS) return -ERESTARTNOHAND; @@ -705,11 +718,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, return -EFAULT; restart = ¤t_thread_info()->restart_block; - restart->fn = (clockid == CLOCK_MONOTONIC) ? - nanosleep_restart_mono : nanosleep_restart_real; + restart->fn = nanosleep_restart; restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; restart->arg1 = timer.expires.tv64 >> 32; restart->arg2 = (unsigned long) rmtp; + restart->arg3 = (unsigned long) timer.base->index; return -ERESTART_RESTARTBLOCK; } @@ -736,10 +749,8 @@ static void __devinit init_hrtimers_cpu(int cpu) struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++) { + for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) spin_lock_init(&base->lock); - base++; - } } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 0cbe633..55b1e5b 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c @@ -179,3 +179,6 @@ EXPORT_SYMBOL(inter_module_register); EXPORT_SYMBOL(inter_module_unregister); EXPORT_SYMBOL(inter_module_get_request); EXPORT_SYMBOL(inter_module_put); + +MODULE_LICENSE("GPL"); + diff --git a/kernel/itimer.c b/kernel/itimer.c index c2c05c4..379be2f 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -49,9 +49,11 @@ int do_getitimer(int which, struct itimerval *value) switch (which) { case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); value->it_value = itimer_get_remtime(&tsk->signal->real_timer); value->it_interval = ktime_to_timeval(tsk->signal->it_real_incr); + spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: read_lock(&tasklist_lock); @@ -150,18 +152,25 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) switch (which) { case ITIMER_REAL: +again: + spin_lock_irq(&tsk->sighand->siglock); timer = &tsk->signal->real_timer; - hrtimer_cancel(timer); if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval = ktime_to_timeval(tsk->signal->it_real_incr); } + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); + goto again; + } tsk->signal->it_real_incr = timeval_to_ktime(value->it_interval); expires = timeval_to_ktime(value->it_value); if (expires.tv64 != 0) hrtimer_start(timer, expires, HRTIMER_REL); + spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3ea6325..fef1af8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -344,23 +344,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) spin_unlock_irqrestore(&kretprobe_lock, flags); } -/* - * This kprobe pre_handler is registered with every kretprobe. When probe - * hits it will set up the return probe. - */ -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) -{ - struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long flags = 0; - - /*TODO: consider to only swap the RA after the last pre_handler fired */ - spin_lock_irqsave(&kretprobe_lock, flags); - arch_prepare_kretprobe(rp, regs); - spin_unlock_irqrestore(&kretprobe_lock, flags); - return 0; -} - static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; @@ -578,6 +561,23 @@ void __kprobes unregister_jprobe(struct jprobe *jp) #ifdef ARCH_SUPPORTS_KRETPROBES +/* + * This kprobe pre_handler is registered with every kretprobe. When probe + * hits it will set up the return probe. + */ +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) +{ + struct kretprobe *rp = container_of(p, struct kretprobe, kp); + unsigned long flags = 0; + + /*TODO: consider to only swap the RA after the last pre_handler fired */ + spin_lock_irqsave(&kretprobe_lock, flags); + arch_prepare_kretprobe(rp, regs); + spin_unlock_irqrestore(&kretprobe_lock, flags); + return 0; +} + int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; @@ -631,12 +631,12 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) unregister_kprobe(&rp->kp); /* No race here */ spin_lock_irqsave(&kretprobe_lock, flags); - free_rp_inst(rp); while ((ri = get_used_rp_inst(rp)) != NULL) { ri->rp = NULL; hlist_del(&ri->uflist); } spin_unlock_irqrestore(&kretprobe_lock, flags); + free_rp_inst(rp); } static int __init init_kprobes(void) diff --git a/kernel/module.c b/kernel/module.c index 618ed6e..5aad477 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1670,6 +1670,9 @@ static struct module *load_module(void __user *umod, goto free_mod; } + /* Userspace could have altered the string after the strlen_user() */ + args[arglen - 1] = '\0'; + if (find_module(mod->name)) { err = -EEXIST; goto free_mod; @@ -2092,7 +2095,8 @@ static unsigned long mod_find_symname(struct module *mod, const char *name) unsigned int i; for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0) + if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && + mod->symtab[i].st_info != 'U') return mod->symtab[i].st_value; return 0; } diff --git a/kernel/panic.c b/kernel/panic.c index c5c4ab2..126dc43 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -130,6 +130,7 @@ NORET_TYPE void panic(const char * fmt, ...) #endif local_irq_enable(); for (i = 0;;) { + touch_softlockup_watchdog(); i += panic_blink(i); mdelay(1); i++; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 197208b..216f574 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -194,9 +194,7 @@ static inline int common_clock_set(const clockid_t which_clock, static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock); - new_timer->it.real.timer.data = new_timer; - new_timer->it.real.timer.function = posix_timer_fn; + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); return 0; } @@ -290,7 +288,8 @@ void do_schedule_next_timer(struct siginfo *info) info->si_overrun = timr->it_overrun_last; } - unlock_timer(timr, flags); + if (timr) + unlock_timer(timr, flags); } int posix_timer_event(struct k_itimer *timr,int si_private) @@ -692,6 +691,7 @@ common_timer_set(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; if (old_setting) common_timer_get(timr, old_setting); @@ -713,14 +713,10 @@ common_timer_set(struct k_itimer *timr, int flags, if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - /* Posix madness. Only absolute CLOCK_REALTIME timers - * are affected by clock sets. So we must reiniatilize - * the timer. - */ - if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME)) - hrtimer_rebase(timer, CLOCK_REALTIME); - else - hrtimer_rebase(timer, CLOCK_MONOTONIC); + mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.data = timr; + timr->it.real.timer.function = posix_timer_fn; timer->expires = timespec_to_ktime(new_setting->it_value); @@ -728,11 +724,15 @@ common_timer_set(struct k_itimer *timr, int flags, timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { + /* Setup correct expiry time for relative timers */ + if (mode == HRTIMER_REL) + timer->expires = ktime_add(timer->expires, + timer->base->get_time()); return 0; + } - hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ? - HRTIMER_ABS : HRTIMER_REL); + hrtimer_start(timer, timer->expires, mode); return 0; } @@ -875,12 +875,6 @@ int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) } EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); -int do_posix_clock_notimer_create(struct k_itimer *timer) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); - int do_posix_clock_nonanosleep(const clockid_t clock, int flags, struct timespec *t, struct timespec __user *r) { @@ -947,21 +941,8 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave, struct timespec __user *rmtp) { - int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; - int clockid = which_clock; - - switch (which_clock) { - case CLOCK_REALTIME: - /* Posix madness. Only absolute timers on clock realtime - are affected by clock set. */ - if (mode != HRTIMER_ABS) - clockid = CLOCK_MONOTONIC; - case CLOCK_MONOTONIC: - break; - default: - return -EINVAL; - } - return hrtimer_nanosleep(tsave, rmtp, mode, clockid); + return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + HRTIMER_ABS : HRTIMER_REL, which_clock); } asmlinkage long diff --git a/kernel/power/console.c b/kernel/power/console.c index 7ff375e..623786d 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,18 +9,13 @@ #include <linux/console.h> #include "power.h" -static int new_loglevel = 10; -static int orig_loglevel; -#ifdef SUSPEND_CONSOLE +#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) +#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) + static int orig_fgconsole, orig_kmsg; -#endif int pm_prepare_console(void) { - orig_loglevel = console_loglevel; - console_loglevel = new_loglevel; - -#ifdef SUSPEND_CONSOLE acquire_console_sem(); orig_fgconsole = fg_console; @@ -41,18 +36,15 @@ int pm_prepare_console(void) } orig_kmsg = kmsg_redirect; kmsg_redirect = SUSPEND_CONSOLE; -#endif return 0; } void pm_restore_console(void) { - console_loglevel = orig_loglevel; -#ifdef SUSPEND_CONSOLE acquire_console_sem(); set_console(orig_fgconsole); release_console_sem(); kmsg_redirect = orig_kmsg; -#endif return; } +#endif diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e24446f..0b43847 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -53,7 +53,7 @@ static void power_down(suspend_disk_method_t mode) switch(mode) { case PM_DISK_PLATFORM: - kernel_power_off_prepare(); + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); error = pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: @@ -95,13 +95,6 @@ static int prepare_processes(void) goto thaw; } - if (pm_disk_mode == PM_DISK_PLATFORM) { - if (pm_ops && pm_ops->prepare) { - if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) - goto thaw; - } - } - /* Free memory before shutting down devices. */ if (!(error = swsusp_shrink_memory())) return 0; @@ -367,14 +360,14 @@ power_attr(resume); static ssize_t image_size_show(struct subsystem * subsys, char *buf) { - return sprintf(buf, "%u\n", image_size); + return sprintf(buf, "%lu\n", image_size); } static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) { - unsigned int size; + unsigned long size; - if (sscanf(buf, "%u", &size) == 1) { + if (sscanf(buf, "%lu", &size) == 1) { image_size = size; return n; } diff --git a/kernel/power/main.c b/kernel/power/main.c index d253f3a..9cb235c 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -133,10 +133,10 @@ static int suspend_enter(suspend_state_t state) static void suspend_finish(suspend_state_t state) { device_resume(); - if (pm_ops && pm_ops->finish) - pm_ops->finish(state); thaw_processes(); enable_nonboot_cpus(); + if (pm_ops && pm_ops->finish) + pm_ops->finish(state); pm_restore_console(); } diff --git a/kernel/power/power.h b/kernel/power/power.h index 7e8492f..388dba6 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -1,14 +1,6 @@ #include <linux/suspend.h> #include <linux/utsname.h> -/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but - we probably do not take enough locks for switching consoles, etc, - so bad things might happen. -*/ -#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) -#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) -#endif - struct swsusp_info { struct new_utsname uts; u32 version_code; @@ -42,17 +34,14 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -extern int pm_prepare_console(void); -extern void pm_restore_console(void); - /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; -/* Preferred image size in MB (default 500) */ -extern unsigned int image_size; +/* Preferred image size in bytes (default 500 MB) */ +extern unsigned long image_size; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 41f6636..8d5a598 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -91,10 +91,8 @@ static int save_highmem_zone(struct zone *zone) * corrected eventually when the cases giving rise to this * are better understood. */ - if (PageReserved(page)) { - printk("highmem reserved page?!\n"); + if (PageReserved(page)) continue; - } BUG_ON(PageNosave(page)); if (PageNosaveFree(page)) continue; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 55a18d2..2d9d08f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -70,12 +70,12 @@ #include "power.h" /* - * Preferred image size in MB (tunable via /sys/power/image_size). + * Preferred image size in bytes (tunable via /sys/power/image_size). * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N MB, but if that is impossible, it will + * size will not exceed N bytes, but if that is impossible, it will * try to create the smallest image possible. */ -unsigned int image_size = 500; +unsigned long image_size = 500 * 1024 * 1024; #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void); @@ -153,13 +153,11 @@ static int swsusp_swap_check(void) /* This is called before saving image */ { int i; - if (!swsusp_resume_device) - return -ENODEV; spin_lock(&swap_lock); for (i = 0; i < MAX_SWAPFILES; i++) { if (!(swap_info[i].flags & SWP_WRITEOK)) continue; - if (is_resume_device(swap_info + i)) { + if (!swsusp_resume_device || is_resume_device(swap_info + i)) { spin_unlock(&swap_lock); root_swap = i; return 0; @@ -590,7 +588,7 @@ int swsusp_shrink_memory(void) if (!tmp) return -ENOMEM; pages += tmp; - } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) { + } else if (size > image_size / PAGE_SIZE) { tmp = shrink_all_memory(SHRINK_BITE); pages += tmp; } @@ -743,7 +741,6 @@ static int submit(int rw, pgoff_t page_off, void *page) if (!bio) return -ENOMEM; bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio_get(bio); bio->bi_bdev = resume_bdev; bio->bi_end_io = end_io; @@ -753,14 +750,13 @@ static int submit(int rw, pgoff_t page_off, void *page) goto Done; } - if (rw == WRITE) - bio_set_pages_dirty(bio); atomic_set(&io_done, 1); submit_bio(rw | (1 << BIO_RW_SYNC), bio); while (atomic_read(&io_done)) yield(); - + if (rw == READ) + bio_set_pages_dirty(bio); Done: bio_put(bio); return error; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 5f33cdb..d95a72c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -72,8 +72,8 @@ void ptrace_untrace(task_t *child) */ void __ptrace_unlink(task_t *child) { - if (!child->ptrace) - BUG(); + BUG_ON(!child->ptrace); + child->ptrace = 0; if (!list_empty(&child->ptrace_list)) { list_del_init(&child->ptrace_list); @@ -184,22 +184,27 @@ bad: return retval; } +void __ptrace_detach(struct task_struct *child, unsigned int data) +{ + child->exit_code = data; + /* .. re-parent .. */ + __ptrace_unlink(child); + /* .. and wake it up. */ + if (child->exit_state != EXIT_ZOMBIE) + wake_up_process(child); +} + int ptrace_detach(struct task_struct *child, unsigned int data) { if (!valid_signal(data)) - return -EIO; + return -EIO; /* Architecture-specific hardware disable .. */ ptrace_disable(child); - /* .. re-parent .. */ - child->exit_code = data; - write_lock_irq(&tasklist_lock); - __ptrace_unlink(child); - /* .. and wake it up. */ - if (child->exit_state != EXIT_ZOMBIE) - wake_up_process(child); + if (child->ptrace) + __ptrace_detach(child, data); write_unlock_irq(&tasklist_lock); return 0; @@ -242,8 +247,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - if (!PageCompound(page)) - set_page_dirty_lock(page); + set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7732199..7712912 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -114,16 +114,16 @@ rcu_torture_alloc(void) { struct list_head *p; - spin_lock(&rcu_torture_lock); + spin_lock_bh(&rcu_torture_lock); if (list_empty(&rcu_torture_freelist)) { atomic_inc(&n_rcu_torture_alloc_fail); - spin_unlock(&rcu_torture_lock); + spin_unlock_bh(&rcu_torture_lock); return NULL; } atomic_inc(&n_rcu_torture_alloc); p = rcu_torture_freelist.next; list_del_init(p); - spin_unlock(&rcu_torture_lock); + spin_unlock_bh(&rcu_torture_lock); return container_of(p, struct rcu_torture, rtort_free); } @@ -134,9 +134,9 @@ static void rcu_torture_free(struct rcu_torture *p) { atomic_inc(&n_rcu_torture_free); - spin_lock(&rcu_torture_lock); + spin_lock_bh(&rcu_torture_lock); list_add_tail(&p->rtort_free, &rcu_torture_freelist); - spin_unlock(&rcu_torture_lock); + spin_unlock_bh(&rcu_torture_lock); } static void diff --git a/kernel/sched.c b/kernel/sched.c index 3ee2ae4..12d291b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -215,7 +215,6 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long prio_bias; unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -669,68 +668,13 @@ static int effective_prio(task_t *p) return prio; } -#ifdef CONFIG_SMP -static inline void inc_prio_bias(runqueue_t *rq, int prio) -{ - rq->prio_bias += MAX_PRIO - prio; -} - -static inline void dec_prio_bias(runqueue_t *rq, int prio) -{ - rq->prio_bias -= MAX_PRIO - prio; -} - -static inline void inc_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running++; - if (rt_task(p)) { - if (p != rq->migration_thread) - /* - * The migration thread does the actual balancing. Do - * not bias by its priority as the ultra high priority - * will skew balancing adversely. - */ - inc_prio_bias(rq, p->prio); - } else - inc_prio_bias(rq, p->static_prio); -} - -static inline void dec_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running--; - if (rt_task(p)) { - if (p != rq->migration_thread) - dec_prio_bias(rq, p->prio); - } else - dec_prio_bias(rq, p->static_prio); -} -#else -static inline void inc_prio_bias(runqueue_t *rq, int prio) -{ -} - -static inline void dec_prio_bias(runqueue_t *rq, int prio) -{ -} - -static inline void inc_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running++; -} - -static inline void dec_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running--; -} -#endif - /* * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - inc_nr_running(p, rq); + rq->nr_running++; } /* @@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); - inc_nr_running(p, rq); + rq->nr_running++; } static int recalc_task_prio(task_t *p, unsigned long long now) @@ -863,7 +807,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - dec_nr_running(p, rq); + rq->nr_running--; dequeue_task(p, p->array); p->array = NULL; } @@ -1007,61 +951,27 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static unsigned long __source_load(int cpu, int type, enum idle_type idle) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long running = rq->nr_running; - unsigned long source_load, cpu_load = rq->cpu_load[type-1], - load_now = running * SCHED_LOAD_SCALE; - + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; if (type == 0) - source_load = load_now; - else - source_load = min(cpu_load, load_now); - - if (running > 1 || (idle == NOT_IDLE && running)) - /* - * If we are busy rebalancing the load is biased by - * priority to create 'nice' support across cpus. When - * idle rebalancing we should only bias the source_load if - * there is more than one task running on that queue to - * prevent idle rebalance from trying to pull tasks from a - * queue with only one running task. - */ - source_load = source_load * rq->prio_bias / running; + return load_now; - return source_load; -} - -static inline unsigned long source_load(int cpu, int type) -{ - return __source_load(cpu, type, NOT_IDLE); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long running = rq->nr_running; - unsigned long target_load, cpu_load = rq->cpu_load[type-1], - load_now = running * SCHED_LOAD_SCALE; - + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; if (type == 0) - target_load = load_now; - else - target_load = max(cpu_load, load_now); - - if (running > 1 || (idle == NOT_IDLE && running)) - target_load = target_load * rq->prio_bias / running; + return load_now; - return target_load; -} - -static inline unsigned long target_load(int cpu, int type) -{ - return __target_load(cpu, type, NOT_IDLE); + return max(rq->cpu_load[type-1], load_now); } /* @@ -1294,9 +1204,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) } } - if (p->last_waker_cpu != this_cpu) - goto out_set_cpu; - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; @@ -1367,8 +1274,6 @@ out_set_cpu: cpu = task_cpu(p); } - p->last_waker_cpu = this_cpu; - out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) { @@ -1450,12 +1355,9 @@ void fastcall sched_fork(task_t *p, int clone_flags) #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) - p->last_waker_cpu = cpu; -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; #endif -#endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; @@ -1530,7 +1432,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; - inc_nr_running(p, rq); + rq->nr_running++; } set_need_resched(); } else @@ -1875,9 +1777,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - dec_nr_running(p, src_rq); + src_rq->nr_running--; set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); + this_rq->nr_running++; enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; @@ -2056,9 +1958,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Bias balancing toward cpus of our domain */ if (local_group) - load = __target_load(i, load_idx, idle); + load = target_load(i, load_idx); else - load = __source_load(i, load_idx, idle); + load = source_load(i, load_idx); avg_load += load; } @@ -2171,7 +2073,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, int i; for_each_cpu_mask(i, group->cpumask) { - load = __source_load(i, 0, idle); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -3571,10 +3473,8 @@ void set_user_nice(task_t *p, long nice) goto out_unlock; } array = p->array; - if (array) { + if (array) dequeue_task(p, array); - dec_prio_bias(rq, p->static_prio); - } old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); @@ -3584,7 +3484,6 @@ void set_user_nice(task_t *p, long nice) if (array) { enqueue_task(p, array); - inc_prio_bias(rq, p->static_prio); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4031,7 +3930,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) goto out_unlock; retval = 0; - cpus_and(*mask, p->cpus_allowed, cpu_possible_map); + cpus_and(*mask, p->cpus_allowed, cpu_online_map); out_unlock: read_unlock(&tasklist_lock); @@ -5141,7 +5040,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, #define SEARCH_SCOPE 2 #define MIN_CACHE_SIZE (64*1024U) #define DEFAULT_CACHE_SIZE (5*1024*1024U) -#define ITERATIONS 2 +#define ITERATIONS 1 #define SIZE_THRESH 130 #define COST_THRESH 130 @@ -5159,7 +5058,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, #define MAX_DOMAIN_DISTANCE 32 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = - { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = +/* + * Architectures may override the migration cost and thus avoid + * boot-time calibration. Unit is nanoseconds. Mostly useful for + * virtualized hardware: + */ +#ifdef CONFIG_DEFAULT_MIGRATION_COST + CONFIG_DEFAULT_MIGRATION_COST +#else + -1LL +#endif +}; /* * Allow override of migration cost - in units of microseconds. @@ -5480,9 +5390,9 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) break; } /* - * Increase the cachesize in 5% steps: + * Increase the cachesize in 10% steps: */ - size = size * 20 / 19; + size = size * 10 / 9; } if (migration_debug) @@ -5551,13 +5461,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) -1 #endif ); - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); + if (system_state == SYSTEM_BOOTING) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); + } + printk("\n"); } - printk("\n"); j1 = jiffies; if (migration_debug) printk("migration: %ld seconds\n", (j1-j0)/HZ); @@ -6109,7 +6021,7 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { prio_array_t *array; rq = cpu_rq(i); diff --git a/kernel/signal.c b/kernel/signal.c index d3efafd..ea15410 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -283,7 +283,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, return(q); } -static inline void __sigqueue_free(struct sigqueue *q) +static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; @@ -2430,7 +2430,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) } int -do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) +do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct k_sigaction *k; sigset_t mask; @@ -2454,6 +2454,8 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) *oact = *k; if (act) { + sigdelsetmask(&act->sa.sa_mask, + sigmask(SIGKILL) | sigmask(SIGSTOP)); /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is @@ -2479,8 +2481,6 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) read_lock(&tasklist_lock); spin_lock_irq(&t->sighand->siglock); *k = *act; - sigdelsetmask(&k->sa.sa_mask, - sigmask(SIGKILL) | sigmask(SIGSTOP)); sigemptyset(&mask); sigaddset(&mask, sig); rm_from_queue_full(&mask, &t->signal->shared_pending); @@ -2495,8 +2495,6 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) } *k = *act; - sigdelsetmask(&k->sa.sa_mask, - sigmask(SIGKILL) | sigmask(SIGSTOP)); } spin_unlock_irq(¤t->sighand->siglock); @@ -2702,6 +2700,7 @@ sys_signal(int sig, __sighandler_t handler) new_sa.sa.sa_handler = handler; new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; + sigemptyset(&new_sa.sa.sa_mask); ret = do_sigaction(sig, &new_sa, &old_sa); diff --git a/kernel/sys.c b/kernel/sys.c index d09cac2..f91218a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -428,7 +428,7 @@ void kernel_kexec(void) { #ifdef CONFIG_KEXEC struct kimage *image; - image = xchg(&kexec_image, 0); + image = xchg(&kexec_image, NULL); if (!image) { return; } @@ -440,23 +440,25 @@ void kernel_kexec(void) } EXPORT_SYMBOL_GPL(kernel_kexec); +void kernel_shutdown_prepare(enum system_states state) +{ + notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + device_shutdown(); +} /** * kernel_halt - halt the system * * Shutdown everything and perform a clean system halt. */ -void kernel_halt_prepare(void) -{ - notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); - system_state = SYSTEM_HALT; - device_shutdown(); -} void kernel_halt(void) { - kernel_halt_prepare(); + kernel_shutdown_prepare(SYSTEM_HALT); printk(KERN_EMERG "System halted.\n"); machine_halt(); } + EXPORT_SYMBOL_GPL(kernel_halt); /** @@ -464,20 +466,13 @@ EXPORT_SYMBOL_GPL(kernel_halt); * * Shutdown everything and perform a clean system power_off. */ -void kernel_power_off_prepare(void) -{ - notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); - system_state = SYSTEM_POWER_OFF; - device_shutdown(); -} void kernel_power_off(void) { - kernel_power_off_prepare(); + kernel_shutdown_prepare(SYSTEM_POWER_OFF); printk(KERN_EMERG "Power down.\n"); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); - /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 17313b9..1067090 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -104,6 +104,8 @@ cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(compat_sys_ipc); +cond_syscall(compat_sys_sysctl); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb99a42..c05a2b7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -44,14 +44,12 @@ #include <linux/limits.h> #include <linux/dcache.h> #include <linux/syscalls.h> +#include <linux/nfs_fs.h> +#include <linux/acpi.h> #include <asm/uaccess.h> #include <asm/processor.h> -#ifdef CONFIG_ROOT_NFS -#include <linux/nfs_fs.h> -#endif - #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -126,8 +124,6 @@ extern int sysctl_hz_timer; extern int acct_parm[]; #endif -int randomize_va_space = 1; - static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -640,6 +636,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#if defined(CONFIG_MMU) { .ctl_name = KERN_RANDOMIZE, .procname = "randomize_va_space", @@ -648,6 +645,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { .ctl_name = KERN_SPIN_RETRY, @@ -658,6 +656,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_ACPI_SLEEP + { + .ctl_name = KERN_ACPI_VIDEO_FLAGS, + .procname = "acpi_video_flags", + .data = &acpi_video_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; @@ -878,7 +886,17 @@ static ctl_table vm_table[] = { .maxlen = sizeof(zone_reclaim_mode), .mode = 0644, .proc_handler = &proc_dointvec, - .strategy = &zero, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_ZONE_RECLAIM_INTERVAL, + .procname = "zone_reclaim_interval", + .data = &zone_reclaim_interval, + .maxlen = sizeof(zone_reclaim_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, }, #endif { .ctl_name = 0 } diff --git a/kernel/time.c b/kernel/time.c index 7477b1d..8045391 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -155,7 +155,7 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) static int firsttime = 1; int error = 0; - if (!timespec_valid(tv)) + if (tv && !timespec_valid(tv)) return -EINVAL; error = security_settime(tv, tz); @@ -637,15 +637,16 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) * * Returns the timespec representation of the nsec parameter. */ -inline struct timespec ns_to_timespec(const nsec_t nsec) +struct timespec ns_to_timespec(const nsec_t nsec) { struct timespec ts; - if (nsec) - ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, - &ts.tv_nsec); - else - ts.tv_sec = ts.tv_nsec = 0; + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); + if (unlikely(nsec < 0)) + set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); return ts; } diff --git a/kernel/timer.c b/kernel/timer.c index 4f1cb0a..fe3a9a9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -495,7 +495,7 @@ unsigned long next_timer_interrupt(void) base = &__get_cpu_var(tvec_bases); spin_lock(&base->t_base.lock); expires = base->timer_jiffies + (LONG_MAX >> 1); - list = 0; + list = NULL; /* Look for timer events in tv1. */ j = base->timer_jiffies & TVR_MASK; @@ -717,12 +717,16 @@ static void second_overflow(void) #endif } -/* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +/* + * Returns how many microseconds we need to add to xtime this tick + * in doing an adjustment requested with adjtime. + */ +static long adjtime_adjustment(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; - if ((time_adjust_step = time_adjust) != 0 ) { + time_adjust_step = time_adjust; + if (time_adjust_step) { /* * We are doing an adjtime thing. Prepare time_adjust_step to * be within bounds. Note that a positive time_adjust means we @@ -733,10 +737,19 @@ static void update_wall_time_one_tick(void) */ time_adjust_step = min(time_adjust_step, (long)tickadj); time_adjust_step = max(time_adjust_step, (long)-tickadj); + } + return time_adjust_step; +} +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + long time_adjust_step, delta_nsec; + + time_adjust_step = adjtime_adjustment(); + if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - } delta_nsec = tick_nsec + time_adjust_step * 1000; /* * Advance the phase, once it gets to one microsecond, then @@ -759,6 +772,22 @@ static void update_wall_time_one_tick(void) } /* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 + * bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + long delta_nsec; + + delta_nsec = tick_nsec + adjtime_adjustment() * 1000; + return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; +} + +/* * Using a loop looks inefficient, but "ticks" is * usually just one (we shouldn't be losing ticks, * we're doing this this way mainly for interrupt diff --git a/kernel/user.c b/kernel/user.c index 89e562f..d9deae4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> +#include <linux/interrupt.h> /* * UID task count cache, to get fast user lookup in "alloc_uid" @@ -27,6 +28,16 @@ static kmem_cache_t *uid_cachep; static struct list_head uidhash_table[UIDHASH_SZ]; + +/* + * The uidhash_lock is mostly taken from process context, but it is + * occasionally also taken from softirq/tasklet context, when + * task-structs get RCU-freed. Hence all locking must be softirq-safe. + * But free_uid() is also called with local interrupts disabled, and running + * local_bh_enable() with local interrupts disabled is an error - we'll run + * softirq callbacks, and they can unconditionally enable interrupts, and + * the caller of free_uid() didn't expect that.. + */ static DEFINE_SPINLOCK(uidhash_lock); struct user_struct root_user = { @@ -82,15 +93,19 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *has struct user_struct *find_user(uid_t uid) { struct user_struct *ret; + unsigned long flags; - spin_lock(&uidhash_lock); + spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); - spin_unlock(&uidhash_lock); + spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { + unsigned long flags; + + local_irq_save(flags); if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { uid_hash_remove(up); key_put(up->uid_keyring); @@ -98,6 +113,7 @@ void free_uid(struct user_struct *up) kmem_cache_free(uid_cachep, up); spin_unlock(&uidhash_lock); } + local_irq_restore(flags); } struct user_struct * alloc_uid(uid_t uid) @@ -105,9 +121,9 @@ struct user_struct * alloc_uid(uid_t uid) struct list_head *hashent = uidhashentry(uid); struct user_struct *up; - spin_lock(&uidhash_lock); + spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); - spin_unlock(&uidhash_lock); + spin_unlock_irq(&uidhash_lock); if (!up) { struct user_struct *new; @@ -137,7 +153,7 @@ struct user_struct * alloc_uid(uid_t uid) * Before adding this, check whether we raced * on adding the same user already.. */ - spin_lock(&uidhash_lock); + spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { key_put(new->uid_keyring); @@ -147,7 +163,7 @@ struct user_struct * alloc_uid(uid_t uid) uid_hash_insert(new, hashent); up = new; } - spin_unlock(&uidhash_lock); + spin_unlock_irq(&uidhash_lock); } return up; @@ -183,9 +199,9 @@ static int __init uid_cache_init(void) INIT_LIST_HEAD(uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ - spin_lock(&uidhash_lock); + spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(0)); - spin_unlock(&uidhash_lock); + spin_unlock_irq(&uidhash_lock); return 0; } |