author | Scott Wood <scottwood@freescale.com> | 2014-05-14 18:19:12 (GMT)
committer | Scott Wood <scottwood@freescale.com> | 2014-05-14 18:37:18 (GMT)
commit | 86ba38e6f5f2fbfe9b49e153ea89593b26482019 (patch)
tree | f99d2906b0eafca507f37289e68052fc105cc2dc /kernel
parent | 07c8b57b111585a617b2b456497fc9b33c00743c (diff)
download | linux-fsl-qoriq-86ba38e6f5f2fbfe9b49e153ea89593b26482019.tar.xz
Reset to 3.12.19
Diffstat (limited to 'kernel')
71 files changed, 869 insertions, 5858 deletions
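The hunks below revert the PREEMPT_RT-specific changes under kernel/ back to the vanilla 3.12.19 code; the largest single block removes the RT CPU-hotplug pinning helpers (pin_current_cpu()/unpin_current_cpu() and the hotplug_pcp machinery) from kernel/cpu.c, leaving the stock get_online_cpus()/put_online_cpus() interface. As orientation only — this sketch is not part of the commit and the function walk_online_cpus() is hypothetical — typical use of that remaining interface looks roughly like this:

```c
#include <linux/cpu.h>
#include <linux/cpumask.h>

/*
 * Minimal usage sketch: hold off CPU unplug while touching per-CPU
 * state. get_online_cpus() may sleep and blocks a concurrent
 * cpu_down() until the matching put_online_cpus().
 */
static void walk_online_cpus(void)
{
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu) {
		/* per-CPU work that must not race with CPU unplug */
	}
	put_online_cpus();
}
```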
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 8bb92eb..d2b32ac 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -222,4 +222,4 @@ endif config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES && !PREEMPT_RT_FULL + depends on SMP && !DEBUG_MUTEXES diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 11dbe26..3f9c974 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,16 +1,3 @@ -config PREEMPT - bool - select PREEMPT_COUNT - -config PREEMPT_RT_BASE - bool - select PREEMPT - -config HAVE_PREEMPT_LAZY - bool - -config PREEMPT_LAZY - def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL choice prompt "Preemption Model" @@ -46,9 +33,9 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT__LL +config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" - select PREEMPT + select PREEMPT_COUNT select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making @@ -65,22 +52,6 @@ config PREEMPT__LL embedded system with latency requirements in the milliseconds range. -config PREEMPT_RTB - bool "Preemptible Kernel (Basic RT)" - select PREEMPT_RT_BASE - help - This option is basically the same as (Low-Latency Desktop) but - enables changes which are preliminary for the full preemptible - RT kernel. - -config PREEMPT_RT_FULL - bool "Fully Preemptible Kernel (RT)" - depends on IRQ_FORCED_THREADING - select PREEMPT_RT_BASE - select PREEMPT_RCU - help - All and everything - endchoice config PREEMPT_COUNT diff --git a/kernel/Makefile b/kernel/Makefile index b3ff0a8..1ce4755 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,10 +7,10 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o nsproxy.o srcu.o semaphore.o \ + kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o lglock.o smpboot.o wait-simple.o + async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -33,11 +33,7 @@ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ -ifneq ($(CONFIG_PREEMPT_RT_FULL),y) -obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o -obj-y += rwsem.o -endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -49,7 +45,6 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o -obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c4f8bc79..1c204fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4410,16 +4410,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, rcu_assign_pointer(cgrp->name, name); /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. 
- */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) { - err = -ENOMEM; - goto err_free_name; - } - - /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and * cgroup_rmdir() are fully synchronized by i_mutex; however, do it @@ -4428,7 +4418,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_id; + goto err_free_name; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4438,6 +4428,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * fs */ atomic_inc(&sb->s_active); + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) { + err = -ENOMEM; + goto err_unlock; + } + init_cgroup_housekeeping(cgrp); dentry->d_fsdata = cgrp; @@ -4544,11 +4544,11 @@ err_free_all: ss->css_free(css); } } + idr_remove(&root->cgroup_idr, cgrp->id); +err_unlock: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); -err_free_id: - idr_remove(&root->cgroup_idr, cgrp->id); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: diff --git a/kernel/cpu.c b/kernel/cpu.c index ba7416b..d7f07a2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -63,290 +63,6 @@ static struct { .refcount = 0, }; -/** - * hotplug_pcp - per cpu hotplug descriptor - * @unplug: set when pin_current_cpu() needs to sync tasks - * @sync_tsk: the task that waits for tasks to finish pinned sections - * @refcount: counter of tasks in pinned sections - * @grab_lock: set when the tasks entering pinned sections should wait - * @synced: notifier for @sync_tsk to tell cpu_down it's finished - * @mutex: the mutex to make tasks wait (used when @grab_lock is true) - * @mutex_init: zero if the mutex hasn't been initialized yet. - * - * Although @unplug and @sync_tsk may point to the same task, the @unplug - * is used as a flag and still exists after @sync_tsk has exited and - * @sync_tsk set to NULL. - */ -struct hotplug_pcp { - struct task_struct *unplug; - struct task_struct *sync_tsk; - int refcount; - int grab_lock; - struct completion synced; - struct completion unplug_wait; -#ifdef CONFIG_PREEMPT_RT_FULL - /* - * Note, on PREEMPT_RT, the hotplug lock must save the state of - * the task, otherwise the mutex will cause the task to fail - * to sleep when required. (Because it's called from migrate_disable()) - * - * The spinlock_t on PREEMPT_RT is a mutex that saves the task's - * state. - */ - spinlock_t lock; -#else - struct mutex mutex; -#endif - int mutex_init; -}; - -#ifdef CONFIG_PREEMPT_RT_FULL -# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock) -# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock) -#else -# define hotplug_lock(hp) mutex_lock(&(hp)->mutex) -# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex) -#endif - -static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); - -/** - * pin_current_cpu - Prevent the current cpu from being unplugged - * - * Lightweight version of get_online_cpus() to prevent cpu from being - * unplugged when code runs in a migration disabled region. - * - * Must be called with preemption disabled (preempt_count = 1)! 
- */ -void pin_current_cpu(void) -{ - struct hotplug_pcp *hp; - int force = 0; - -retry: - hp = &__get_cpu_var(hotplug_pcp); - - if (!hp->unplug || hp->refcount || force || preempt_count() > 1 || - hp->unplug == current) { - hp->refcount++; - return; - } - if (hp->grab_lock) { - preempt_enable(); - hotplug_lock(hp); - hotplug_unlock(hp); - } else { - preempt_enable(); - /* - * Try to push this task off of this CPU. - */ - if (!migrate_me()) { - preempt_disable(); - hp = &__get_cpu_var(hotplug_pcp); - if (!hp->grab_lock) { - /* - * Just let it continue it's already pinned - * or about to sleep. - */ - force = 1; - goto retry; - } - preempt_enable(); - } - } - preempt_disable(); - goto retry; -} - -/** - * unpin_current_cpu - Allow unplug of current cpu - * - * Must be called with preemption or interrupts disabled! - */ -void unpin_current_cpu(void) -{ - struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); - - WARN_ON(hp->refcount <= 0); - - /* This is safe. sync_unplug_thread is pinned to this cpu */ - if (!--hp->refcount && hp->unplug && hp->unplug != current) - wake_up_process(hp->unplug); -} - -static void wait_for_pinned_cpus(struct hotplug_pcp *hp) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - while (hp->refcount) { - schedule_preempt_disabled(); - set_current_state(TASK_UNINTERRUPTIBLE); - } -} - -static int sync_unplug_thread(void *data) -{ - struct hotplug_pcp *hp = data; - - wait_for_completion(&hp->unplug_wait); - preempt_disable(); - hp->unplug = current; - wait_for_pinned_cpus(hp); - - /* - * This thread will synchronize the cpu_down() with threads - * that have pinned the CPU. When the pinned CPU count reaches - * zero, we inform the cpu_down code to continue to the next step. - */ - set_current_state(TASK_UNINTERRUPTIBLE); - preempt_enable(); - complete(&hp->synced); - - /* - * If all succeeds, the next step will need tasks to wait till - * the CPU is offline before continuing. To do this, the grab_lock - * is set and tasks going into pin_current_cpu() will block on the - * mutex. But we still need to wait for those that are already in - * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop() - * will kick this thread out. - */ - while (!hp->grab_lock && !kthread_should_stop()) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - - /* Make sure grab_lock is seen before we see a stale completion */ - smp_mb(); - - /* - * Now just before cpu_down() enters stop machine, we need to make - * sure all tasks that are in pinned CPU sections are out, and new - * tasks will now grab the lock, keeping them from entering pinned - * CPU sections. - */ - if (!kthread_should_stop()) { - preempt_disable(); - wait_for_pinned_cpus(hp); - preempt_enable(); - complete(&hp->synced); - } - - set_current_state(TASK_UNINTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - set_current_state(TASK_RUNNING); - - /* - * Force this thread off this CPU as it's going down and - * we don't want any more work on this CPU. 
- */ - current->flags &= ~PF_NO_SETAFFINITY; - do_set_cpus_allowed(current, cpu_present_mask); - migrate_me(); - return 0; -} - -static void __cpu_unplug_sync(struct hotplug_pcp *hp) -{ - wake_up_process(hp->sync_tsk); - wait_for_completion(&hp->synced); -} - -static void __cpu_unplug_wait(unsigned int cpu) -{ - struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); - - complete(&hp->unplug_wait); - wait_for_completion(&hp->synced); -} - -/* - * Start the sync_unplug_thread on the target cpu and wait for it to - * complete. - */ -static int cpu_unplug_begin(unsigned int cpu) -{ - struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); - int err; - - /* Protected by cpu_hotplug.lock */ - if (!hp->mutex_init) { -#ifdef CONFIG_PREEMPT_RT_FULL - spin_lock_init(&hp->lock); -#else - mutex_init(&hp->mutex); -#endif - hp->mutex_init = 1; - } - - /* Inform the scheduler to migrate tasks off this CPU */ - tell_sched_cpu_down_begin(cpu); - - init_completion(&hp->synced); - init_completion(&hp->unplug_wait); - - hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); - if (IS_ERR(hp->sync_tsk)) { - err = PTR_ERR(hp->sync_tsk); - hp->sync_tsk = NULL; - return err; - } - kthread_bind(hp->sync_tsk, cpu); - - /* - * Wait for tasks to get out of the pinned sections, - * it's still OK if new tasks enter. Some CPU notifiers will - * wait for tasks that are going to enter these sections and - * we must not have them block. - */ - wake_up_process(hp->sync_tsk); - return 0; -} - -static void cpu_unplug_sync(unsigned int cpu) -{ - struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); - - init_completion(&hp->synced); - /* The completion needs to be initialzied before setting grab_lock */ - smp_wmb(); - - /* Grab the mutex before setting grab_lock */ - hotplug_lock(hp); - hp->grab_lock = 1; - - /* - * The CPU notifiers have been completed. - * Wait for tasks to get out of pinned CPU sections and have new - * tasks block until the CPU is completely down. - */ - __cpu_unplug_sync(hp); - - /* All done with the sync thread */ - kthread_stop(hp->sync_tsk); - hp->sync_tsk = NULL; -} - -static void cpu_unplug_done(unsigned int cpu) -{ - struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); - - hp->unplug = NULL; - /* Let all tasks know cpu unplug is finished before cleaning up */ - smp_wmb(); - - if (hp->sync_tsk) - kthread_stop(hp->sync_tsk); - - if (hp->grab_lock) { - hotplug_unlock(hp); - /* protected by cpu_hotplug.lock */ - hp->grab_lock = 0; - } - tell_sched_cpu_down_done(cpu); -} - void get_online_cpus(void) { might_sleep(); @@ -363,14 +79,15 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); + } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -565,15 +282,13 @@ static int __ref take_cpu_down(void *_param) /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { - int mycpu, err, nr_calls = 0; + int err, nr_calls = 0; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? 
CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { .mod = mod, .hcpu = hcpu, }; - cpumask_var_t cpumask; - cpumask_var_t cpumask_org; if (num_online_cpus() == 1) return -EBUSY; @@ -581,34 +296,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - /* Move the downtaker off the unplug cpu */ - if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) - return -ENOMEM; - if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL)) { - free_cpumask_var(cpumask); - return -ENOMEM; - } - - cpumask_copy(cpumask_org, tsk_cpus_allowed(current)); - cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)); - set_cpus_allowed_ptr(current, cpumask); - free_cpumask_var(cpumask); - migrate_disable(); - mycpu = smp_processor_id(); - if (mycpu == cpu) { - printk(KERN_ERR "Yuck! Still on unplug CPU\n!"); - migrate_enable(); - err = -EBUSY; - goto restore_cpus; - } - migrate_enable(); - cpu_hotplug_begin(); - err = cpu_unplug_begin(cpu); - if (err) { - printk("cpu_unplug_begin(%d) failed\n", cpu); - goto out_cancel; - } err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { @@ -618,13 +306,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } - - __cpu_unplug_wait(cpu); smpboot_park_threads(cpu); - /* Notifiers are done. Don't let any more tasks pin this CPU. */ - cpu_unplug_sync(cpu); - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ @@ -653,14 +336,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); out_release: - cpu_unplug_done(cpu); -out_cancel: cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); -restore_cpus: - set_cpus_allowed_ptr(current, cpumask_org); - free_cpumask_var(cpumask_org); return err; } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 399dba6..14ff484 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -554,6 +554,7 @@ int vkdb_printf(const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; + int saved_trap_printk; int got_printf_lock = 0; int retlen = 0; int fnd, len; @@ -564,6 +565,8 @@ int vkdb_printf(const char *fmt, va_list ap) unsigned long uninitialized_var(flags); preempt_disable(); + saved_trap_printk = kdb_trap_printk; + kdb_trap_printk = 0; /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, @@ -830,6 +833,7 @@ kdb_print_out: } else { __release(kdb_printf_lock); } + kdb_trap_printk = saved_trap_printk; preempt_enable(); return retlen; } @@ -839,11 +843,9 @@ int kdb_printf(const char *fmt, ...) 
va_list ap; int r; - kdb_trap_printk++; va_start(ap, fmt); r = vkdb_printf(fmt, ap); va_end(ap); - kdb_trap_printk--; return r; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 420de7f..fea4f6c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6029,7 +6029,6 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; - hwc->hrtimer.irqsafe = 1; /* * Since hrtimers have a fixed rate, we can do a static freq->period diff --git a/kernel/exit.c b/kernel/exit.c index 7493b32..dcde2c4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -145,7 +145,7 @@ static void __exit_signal(struct task_struct *tsk) * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ - flush_task_sigqueue(tsk); + flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); @@ -559,9 +559,6 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -569,9 +566,19 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* We don't want people slaying init. */ + /* + * We don't want people slaying init. + * + * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() + * can change ->exit_state to EXIT_ZOMBIE. If this is the final + * state, do_notify_parent() was already called and ->exit_signal + * doesn't matter. + */ p->exit_signal = SIGCHLD; + if (p->exit_state == EXIT_DEAD) + return; + /* If it has exited notify the new parent about this child's death. 
*/ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { @@ -783,6 +790,8 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + if (group_dead) + disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); @@ -798,13 +807,9 @@ void do_exit(long code) cgroup_exit(tsk, 1); - if (group_dead) - disassociate_ctty(1); - module_put(task_thread_info(tsk)->exec_domain->module); proc_exit_connector(tsk); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ diff --git a/kernel/fork.c b/kernel/fork.c index ae9a1a4..458953c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,7 +94,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; -DEFINE_RWLOCK(tasklist_lock); /* outer */ +__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) @@ -230,9 +230,7 @@ static inline void put_signal_struct(struct signal_struct *sig) if (atomic_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } -#ifdef CONFIG_PREEMPT_RT_BASE -static -#endif + void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); @@ -247,18 +245,7 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } -#ifndef CONFIG_PREEMPT_RT_BASE EXPORT_SYMBOL_GPL(__put_task_struct); -#else -void __put_task_struct_cb(struct rcu_head *rhp) -{ - struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu); - - __put_task_struct(tsk); - -} -EXPORT_SYMBOL_GPL(__put_task_struct_cb); -#endif void __init __weak arch_task_cache_init(void) { } @@ -611,19 +598,6 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); -#ifdef CONFIG_PREEMPT_RT_BASE -/* - * RCU callback for delayed mm drop. Not strictly rcu, but we don't - * want another facility to make this work. - */ -void __mmdrop_delayed(struct rcu_head *rhp) -{ - struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); - - __mmdrop(mm); -} -#endif - /* * Decrement the use count and release all resources for an mm. 
*/ @@ -1133,9 +1107,6 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) */ static void posix_cpu_timers_init(struct task_struct *tsk) { -#ifdef CONFIG_PREEMPT_RT_BASE - tsk->posix_timer_list = NULL; -#endif tsk->cputime_expires.prof_exp = 0; tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; @@ -1264,7 +1235,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); - p->sigqueue_cache = NULL; p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; @@ -1272,8 +1242,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - raw_spin_lock_init(&p->vtime_lock); - seqcount_init(&p->vtime_seq); + seqlock_init(&p->vtime_seqlock); p->vtime_snap = 0; p->vtime_snap_whence = VTIME_SLEEPING; #endif @@ -1326,9 +1295,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif -#ifdef CONFIG_PREEMPT_RT_FULL - p->pagefault_disabled = 0; -#endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; @@ -1690,7 +1656,7 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } #endif diff --git a/kernel/futex.c b/kernel/futex.c index 3b85a95..d8347b7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -68,7 +68,9 @@ #include "rtmutex_common.h" +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG int __read_mostly futex_cmpxchg_enabled; +#endif #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) @@ -573,9 +575,7 @@ void exit_pi_state_list(struct task_struct *curr) * task still owns the PI-state: */ if (head->next != next) { - raw_spin_unlock_irq(&curr->pi_lock); spin_unlock(&hb->lock); - raw_spin_lock_irq(&curr->pi_lock); continue; } @@ -1449,16 +1449,6 @@ retry_private: requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; - } else if (ret == -EAGAIN) { - /* - * Waiter was woken by timeout or - * signal and has set pi_blocked_on to - * PI_WAKEUP_INPROGRESS before we - * tried to enqueue it on the rtmutex. - */ - this->pi_state = NULL; - free_pi_state(pi_state); - continue; } else if (ret) { /* -EDEADLK */ this->pi_state = NULL; @@ -2302,7 +2292,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - struct futex_hash_bucket *hb, *hb2; + struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; @@ -2327,7 +2317,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - rt_mutex_init_waiter(&rt_waiter, false); + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2348,55 +2339,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); - /* - * On RT we must avoid races with requeue and trying to block - * on two mutexes (hb->lock and uaddr2's rtmutex) by - * serializing access to pi_blocked_on with pi_lock. 
- */ - raw_spin_lock_irq(¤t->pi_lock); - if (current->pi_blocked_on) { - /* - * We have been requeued or are in the process of - * being requeued. - */ - raw_spin_unlock_irq(¤t->pi_lock); - } else { - /* - * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS - * prevents a concurrent requeue from moving us to the - * uaddr2 rtmutex. After that we can safely acquire - * (and possibly block on) hb->lock. - */ - current->pi_blocked_on = PI_WAKEUP_INPROGRESS; - raw_spin_unlock_irq(¤t->pi_lock); - - spin_lock(&hb->lock); - - /* - * Clean up pi_blocked_on. We might leak it otherwise - * when we succeeded with the hb->lock in the fast - * path. - */ - raw_spin_lock_irq(¤t->pi_lock); - current->pi_blocked_on = NULL; - raw_spin_unlock_irq(¤t->pi_lock); - - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out_put_keys; - } + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; /* - * In order to be here, we have either been requeued, are in - * the process of being requeued, or requeue successfully - * acquired uaddr2 on our behalf. If pi_blocked_on was - * non-null above, we may be racing with a requeue. Do not - * rely on q->lock_ptr to be hb2->lock until after blocking on - * hb->lock or hb2->lock. The futex_requeue dropped our key1 - * reference and incremented our key2 reference count. + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. */ - hb2 = hash_futex(&key2); /* Check if the requeue code acquired the second futex for us. */ if (!q.rt_waiter) { @@ -2405,10 +2361,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * did a lock-steal - fix up the PI-state in that case. */ if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(&hb2->lock); - BUG_ON(&hb2->lock != q.lock_ptr); + spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - spin_unlock(&hb2->lock); + spin_unlock(q.lock_ptr); } } else { /* @@ -2421,8 +2376,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); - spin_lock(&hb2->lock); - BUG_ON(&hb2->lock != q.lock_ptr); + spin_lock(q.lock_ptr); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. @@ -2779,10 +2733,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int __init futex_init(void) +static void __init futex_detect_cmpxchg(void) { +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG u32 curval; - int i; /* * This will fail and we want it. 
Some arch implementations do @@ -2796,6 +2750,14 @@ static int __init futex_init(void) */ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ + int i; + + futex_detect_cmpxchg(); for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { plist_head_init(&futex_queues[i].chain); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c19183d..383319b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -47,13 +47,11 @@ #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/timer.h> -#include <linux/kthread.h> #include <linux/freezer.h> #include <asm/uaccess.h> #include <trace/events/timer.h> -#include <trace/events/hist.h> /* * The timer bases: @@ -609,7 +607,8 @@ static int hrtimer_reprogram(struct hrtimer *timer, * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled at the end of the hrtimer_interrupt. + * reprogramming is handled either by the softirq, which called the + * callback or at the end of the hrtimer_interrupt. */ if (hrtimer_callback_running(timer)) return 0; @@ -644,9 +643,6 @@ static int hrtimer_reprogram(struct hrtimer *timer, return res; } -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now); -static int hrtimer_rt_defer(struct hrtimer *timer); - /* * Initialize the high resolution related parts of cpu_base */ @@ -663,18 +659,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) * and expiry check is done in the hrtimer_interrupt or in the softirq. */ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) + struct hrtimer_clock_base *base) { - if (!(base->cpu_base->hres_active && hrtimer_reprogram(timer, base))) - return 0; - if (!wakeup) - return -ETIME; -#ifdef CONFIG_PREEMPT_RT_BASE - if (!hrtimer_rt_defer(timer)) - return -ETIME; -#endif - return 1; + return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) @@ -742,44 +729,6 @@ static void clock_was_set_work(struct work_struct *work) static DECLARE_WORK(hrtimer_work, clock_was_set_work); -#ifdef CONFIG_PREEMPT_RT_FULL -/* - * RT can not call schedule_work from real interrupt context. - * Need to make a thread to do the real work. - */ -static struct task_struct *clock_set_delay_thread; -static bool do_clock_set_delay; - -static int run_clock_set_delay(void *ignore) -{ - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (do_clock_set_delay) { - do_clock_set_delay = false; - schedule_work(&hrtimer_work); - } - schedule(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -void clock_was_set_delayed(void) -{ - do_clock_set_delay = true; - /* Make visible before waking up process */ - smp_wmb(); - wake_up_process(clock_set_delay_thread); -} - -static __init int create_clock_set_delay_thread(void) -{ - clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd"); - BUG_ON(!clock_set_delay_thread); - return 0; -} -early_initcall(create_clock_set_delay_thread); -#else /* PREEMPT_RT_FULL */ /* * Called from timekeeping and resume code to reprogramm the hrtimer * interrupt device on all cpus. 
@@ -788,7 +737,6 @@ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } -#endif #else @@ -798,18 +746,12 @@ static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) + struct hrtimer_clock_base *base) { return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void retrigger_next_event(void *arg) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -928,32 +870,6 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(hrtimer_forward); -#ifdef CONFIG_PREEMPT_RT_BASE -# define wake_up_timer_waiters(b) wake_up(&(b)->wait) - -/** - * hrtimer_wait_for_timer - Wait for a running timer - * - * @timer: timer to wait for - * - * The function waits in case the timers callback function is - * currently executed on the waitqueue of the timer base. The - * waitqueue is woken up after the timer callback function has - * finished execution. - */ -void hrtimer_wait_for_timer(const struct hrtimer *timer) -{ - struct hrtimer_clock_base *base = timer->base; - - if (base && base->cpu_base && !timer->irqsafe) - wait_event(base->cpu_base->wait, - !(timer->state & HRTIMER_STATE_CALLBACK)); -} - -#else -# define wake_up_timer_waiters(b) do { } while (0) -#endif - /* * enqueue_hrtimer - internal function to (re)start a timer * @@ -997,11 +913,6 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; - if (unlikely(!list_empty(&timer->cb_entry))) { - list_del_init(&timer->cb_entry); - goto out; - } - next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); if (&timer->node == next_timer) { @@ -1086,17 +997,6 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, #endif } -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - { - ktime_t now = new_base->get_time(); - - if (ktime_to_ns(tim) < ktime_to_ns(now)) - timer->praecox = now; - else - timer->praecox = ktime_set(0, 0); - } -#endif - hrtimer_set_expires_range_ns(timer, tim, delta_ns); timer_stats_hrtimer_set_start_info(timer); @@ -1109,19 +1009,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * * XXX send_remote_softirq() ? */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) { - ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup); - if (ret < 0) { - /* - * In case we failed to reprogram the timer (mostly - * because out current timer is already elapsed), - * remove it again and report a failure. This avoids - * stale base->first entries. - */ - debug_deactivate(timer); - __remove_hrtimer(timer, new_base, - timer->state & HRTIMER_STATE_CALLBACK, 0); - } else if (ret > 0) { + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) + && hrtimer_enqueue_reprogram(timer, new_base)) { + if (wakeup) { /* * We need to drop cpu_base->lock to avoid a * lock ordering issue vs. rq->lock. 
@@ -1129,7 +1019,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, raw_spin_unlock(&new_base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); local_irq_restore(flags); - return 0; + return ret; + } else { + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); } } @@ -1219,7 +1111,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; - hrtimer_wait_for_timer(timer); + cpu_relax(); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1298,7 +1190,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; - INIT_LIST_HEAD(&timer->cb_entry); timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS @@ -1382,126 +1273,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } -static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); - -#ifdef CONFIG_PREEMPT_RT_BASE -static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - /* - * Note, we clear the callback flag before we requeue the - * timer otherwise we trigger the callback_running() check - * in hrtimer_reprogram(). - */ - timer->state &= ~HRTIMER_STATE_CALLBACK; - - if (restart != HRTIMER_NORESTART) { - BUG_ON(hrtimer_active(timer)); - /* - * Enqueue the timer, if it's the leftmost timer then - * we need to reprogram it. - */ - if (!enqueue_hrtimer(timer, base)) - return; - -#ifndef CONFIG_HIGH_RES_TIMERS - } -#else - if (base->cpu_base->hres_active && - hrtimer_reprogram(timer, base)) - goto requeue; - - } else if (hrtimer_active(timer)) { - /* - * If the timer was rearmed on another CPU, reprogram - * the event device. - */ - if (&timer->node == base->active.next && - base->cpu_base->hres_active && - hrtimer_reprogram(timer, base)) - goto requeue; - } - return; - -requeue: - /* - * Timer is expired. Thus move it from tree to pending list - * again. - */ - __remove_hrtimer(timer, base, timer->state, 0); - list_add_tail(&timer->cb_entry, &base->expired); -#endif -} - -/* - * The changes in mainline which removed the callback modes from - * hrtimer are not yet working with -rt. The non wakeup_process() - * based callbacks which involve sleeping locks need to be treated - * seperately. - */ -static void hrtimer_rt_run_pending(void) -{ - enum hrtimer_restart (*fn)(struct hrtimer *); - struct hrtimer_cpu_base *cpu_base; - struct hrtimer_clock_base *base; - struct hrtimer *timer; - int index, restart; - - local_irq_disable(); - cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); - - raw_spin_lock(&cpu_base->lock); - - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - - while (!list_empty(&base->expired)) { - timer = list_first_entry(&base->expired, - struct hrtimer, cb_entry); - - /* - * Same as the above __run_hrtimer function - * just we run with interrupts enabled. 
- */ - debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); - timer_stats_account_hrtimer(timer); - fn = timer->function; - - raw_spin_unlock_irq(&cpu_base->lock); - restart = fn(timer); - raw_spin_lock_irq(&cpu_base->lock); - - hrtimer_rt_reprogram(restart, timer, base); - } - } - - raw_spin_unlock_irq(&cpu_base->lock); - - wake_up_timer_waiters(cpu_base); -} - -static int hrtimer_rt_defer(struct hrtimer *timer) -{ - if (timer->irqsafe) - return 0; - - __remove_hrtimer(timer, timer->base, timer->state, 0); - list_add_tail(&timer->cb_entry, &timer->base->expired); - return 1; -} - -#else - -static inline void hrtimer_rt_run_pending(void) -{ - hrtimer_peek_ahead_timers(); -} - -static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; } - -#endif - #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1512,7 +1283,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires_next, now, entry_time, delta; - int i, retries = 0, raise = 0; + int i, retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; @@ -1547,15 +1318,6 @@ retry: timer = container_of(node, struct hrtimer, node); - trace_hrtimer_interrupt(raw_smp_processor_id(), - ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ? - timer->praecox : hrtimer_get_expires(timer), - basenow)), - current, - timer->function == hrtimer_wakeup ? - container_of(timer, struct hrtimer_sleeper, - timer)->task : NULL); - /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the @@ -1581,10 +1343,7 @@ retry: break; } - if (!hrtimer_rt_defer(timer)) - __run_hrtimer(timer, &basenow); - else - raise = 1; + __run_hrtimer(timer, &basenow); } } @@ -1599,7 +1358,7 @@ retry: if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; - goto out; + return; } /* @@ -1643,9 +1402,6 @@ retry: tick_program_event(expires_next, 1); printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); -out: - if (raise) - raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* @@ -1681,16 +1437,40 @@ void hrtimer_peek_ahead_timers(void) __hrtimer_peek_ahead_timers(); local_irq_restore(flags); } + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_peek_ahead_timers(); +} + #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ - -static void run_hrtimer_softirq(struct softirq_action *h) +/* + * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. + */ +void hrtimer_run_pending(void) { - hrtimer_rt_run_pending(); + if (hrtimer_hres_active()) + return; + + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock. 
+ */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); } /* @@ -1701,18 +1481,11 @@ void hrtimer_run_queues(void) struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; - int index, gettime = 1, raise = 0; + int index, gettime = 1; if (hrtimer_hres_active()) return; - /* - * Check whether we can switch to highres mode. - */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()) - && hrtimer_switch_to_hres()) - return; - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { base = &cpu_base->clock_base[index]; if (!timerqueue_getnext(&base->active)) @@ -1733,16 +1506,10 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - if (!hrtimer_rt_defer(timer)) - __run_hrtimer(timer, &base->softirq_time); - else - raise = 1; + __run_hrtimer(timer, &base->softirq_time); } raw_spin_unlock(&cpu_base->lock); } - - if (raise) - raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* @@ -1764,18 +1531,16 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; - sl->timer.irqsafe = 1; sl->task = task; } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode, - unsigned long state) +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { hrtimer_init_sleeper(t, current); do { - set_current_state(state); + set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t->timer, mode); if (!hrtimer_active(&t->timer)) t->task = NULL; @@ -1819,8 +1584,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - /* cpu_chill() does not care about restart state. */ - if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE)) + if (do_nanosleep(&t, HRTIMER_MODE_ABS)) goto out; rmtp = restart->nanosleep.rmtp; @@ -1837,10 +1601,8 @@ out: return ret; } -static long -__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, - const enum hrtimer_mode mode, const clockid_t clockid, - unsigned long state) +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; @@ -1853,7 +1615,7 @@ __hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, hrtimer_init_on_stack(&t.timer, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); - if (do_nanosleep(&t, mode, state)) + if (do_nanosleep(&t, mode)) goto out; /* Absolute timers do not update the rmtp value and restart: */ @@ -1880,12 +1642,6 @@ out: return ret; } -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, - const enum hrtimer_mode mode, const clockid_t clockid) -{ - return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE); -} - SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { @@ -1900,26 +1656,6 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } -#ifdef CONFIG_PREEMPT_RT_FULL -/* - * Sleep for 1 ms in hope whoever holds what we want will let it go. 
- */ -void cpu_chill(void) -{ - struct timespec tu = { - .tv_nsec = NSEC_PER_MSEC, - }; - unsigned int freeze_flag = current->flags & PF_NOFREEZE; - - current->flags |= PF_NOFREEZE; - __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC, - TASK_UNINTERRUPTIBLE); - if (!freeze_flag) - current->flags &= ~PF_NOFREEZE; -} -EXPORT_SYMBOL(cpu_chill); -#endif - /* * Functions related to boot-time initialization: */ @@ -1931,13 +1667,9 @@ static void init_hrtimers_cpu(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); - INIT_LIST_HEAD(&cpu_base->clock_base[i].expired); } hrtimer_init_hres(cpu_base); -#ifdef CONFIG_PREEMPT_RT_BASE - init_waitqueue_head(&cpu_base->wait); -#endif } #ifdef CONFIG_HOTPLUG_CPU @@ -2050,7 +1782,9 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif } /** diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 7f50c55..131ca17 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,8 +132,6 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { - struct pt_regs *regs = get_irq_regs(); - u64 ip = regs ? instruction_pointer(regs) : 0; irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; @@ -174,11 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); -#ifndef CONFIG_PREEMPT_RT_FULL - add_interrupt_randomness(irq, flags, ip); -#else - desc->random_ip = ip; -#endif + add_interrupt_randomness(irq, flags); if (!noirqdebug) note_interrupt(irq, desc, retval); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 252bf10..4c84746 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -22,7 +22,6 @@ #include "internals.h" #ifdef CONFIG_IRQ_FORCED_THREADING -# ifndef CONFIG_PREEMPT_RT_BASE __read_mostly bool force_irqthreads; static int __init setup_forced_irqthreads(char *arg) @@ -31,7 +30,6 @@ static int __init setup_forced_irqthreads(char *arg) return 0; } early_param("threadirqs", setup_forced_irqthreads); -# endif #endif /** @@ -164,62 +162,6 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -#ifdef CONFIG_PREEMPT_RT_FULL -static void _irq_affinity_notify(struct irq_affinity_notify *notify); -static struct task_struct *set_affinity_helper; -static LIST_HEAD(affinity_list); -static DEFINE_RAW_SPINLOCK(affinity_list_lock); - -static int set_affinity_thread(void *unused) -{ - while (1) { - struct irq_affinity_notify *notify; - int empty; - - set_current_state(TASK_INTERRUPTIBLE); - - raw_spin_lock_irq(&affinity_list_lock); - empty = list_empty(&affinity_list); - raw_spin_unlock_irq(&affinity_list_lock); - - if (empty) - schedule(); - if (kthread_should_stop()) - break; - set_current_state(TASK_RUNNING); -try_next: - notify = NULL; - - raw_spin_lock_irq(&affinity_list_lock); - if (!list_empty(&affinity_list)) { - notify = list_first_entry(&affinity_list, - struct irq_affinity_notify, list); - list_del_init(¬ify->list); - } - raw_spin_unlock_irq(&affinity_list_lock); - - if (!notify) - continue; - _irq_affinity_notify(notify); - goto try_next; - } - return 0; -} - -static void 
init_helper_thread(void) -{ - if (set_affinity_helper) - return; - set_affinity_helper = kthread_run(set_affinity_thread, NULL, - "affinity-cb"); - WARN_ON(IS_ERR(set_affinity_helper)); -} -#else - -static inline void init_helper_thread(void) { } - -#endif - int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -238,17 +180,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - -#ifdef CONFIG_PREEMPT_RT_FULL - raw_spin_lock(&affinity_list_lock); - if (list_empty(&desc->affinity_notify->list)) - list_add_tail(&affinity_list, - &desc->affinity_notify->list); - raw_spin_unlock(&affinity_list_lock); - wake_up_process(set_affinity_helper); -#else schedule_work(&desc->affinity_notify->work); -#endif } irqd_set(data, IRQD_AFFINITY_SET); @@ -289,8 +221,10 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); -static void _irq_affinity_notify(struct irq_affinity_notify *notify) +static void irq_affinity_notify(struct work_struct *work) { + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; unsigned long flags; @@ -312,13 +246,6 @@ out: kref_put(¬ify->kref, notify->release); } -static void irq_affinity_notify(struct work_struct *work) -{ - struct irq_affinity_notify *notify = - container_of(work, struct irq_affinity_notify, work); - _irq_affinity_notify(notify); -} - /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: Interrupt for which to enable/disable notification @@ -348,8 +275,6 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) notify->irq = irq; kref_init(¬ify->kref); INIT_WORK(¬ify->work, irq_affinity_notify); - INIT_LIST_HEAD(¬ify->list); - init_helper_thread(); } raw_spin_lock_irqsave(&desc->lock, flags); @@ -856,15 +781,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action); - /* - * Interrupts which have real time requirements can be set up - * to avoid softirq processing in the thread handler. This is - * safe as these interrupts do not raise soft interrupts. 
- */ - if (irq_settings_no_softirq_call(desc)) - _local_bh_enable(); - else - local_bh_enable(); + local_bh_enable(); return ret; } @@ -947,12 +864,6 @@ static int irq_thread(void *data) if (!noirqdebug) note_interrupt(action->irq, desc, action_ret); -#ifdef CONFIG_PREEMPT_RT_FULL - migrate_disable(); - add_interrupt_randomness(action->irq, 0, - desc->random_ip ^ (unsigned long) action); - migrate_enable(); -#endif wake_threads_waitq(desc); } @@ -1215,9 +1126,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } - if (new->flags & IRQF_NO_SOFTIRQ_CALL) - irq_settings_set_no_softirq_call(desc); - /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0d2c381..1162f10 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,7 +14,6 @@ enum { _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, - _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -27,7 +26,6 @@ enum { #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON -#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -38,16 +36,6 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } -static inline bool irq_settings_no_softirq_call(struct irq_desc *desc) -{ - return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL; -} - -static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc) -{ - desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL; -} - static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_PER_CPU; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e5a309a..7b5f012 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -340,10 +340,6 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { -#ifdef CONFIG_PREEMPT_RT_BASE - pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n"); - return 1; -#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -356,10 +352,6 @@ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { -#ifdef CONFIG_PREEMPT_RT_BASE - pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n"); - return 1; -#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 35d21f9..55fcce6 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -20,9 +20,6 @@ static DEFINE_PER_CPU(struct llist_head, irq_work_list); -#ifdef CONFIG_PREEMPT_RT_FULL -static DEFINE_PER_CPU(struct llist_head, hirq_work_list); -#endif static DEFINE_PER_CPU(int, irq_work_raised); /* @@ -51,11 +48,7 @@ static bool irq_work_claim(struct irq_work *work) return true; } -#ifdef CONFIG_PREEMPT_RT_FULL -void arch_irq_work_raise(void) -#else void __weak arch_irq_work_raise(void) -#endif { /* * Lame architectures will get the timer tick callback @@ -77,12 +70,8 @@ void irq_work_queue(struct irq_work *work) /* Queue the entry and raise the IPI if needed. 
*/ preempt_disable(); -#ifdef CONFIG_PREEMPT_RT_FULL - if (work->flags & IRQ_WORK_HARD_IRQ) - llist_add(&work->llnode, &__get_cpu_var(hirq_work_list)); - else -#endif - llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); + llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); + /* * If the work is not "lazy" or the tick is stopped, raise the irq * work interrupt (if supported by the arch), otherwise, just wait @@ -126,18 +115,12 @@ static void __irq_work_run(void) __this_cpu_write(irq_work_raised, 0); barrier(); -#ifdef CONFIG_PREEMPT_RT_FULL - if (in_irq()) - this_list = &__get_cpu_var(hirq_work_list); - else -#endif - this_list = &__get_cpu_var(irq_work_list); + this_list = &__get_cpu_var(irq_work_list); if (llist_empty(this_list)) return; -#ifndef CONFIG_PREEMPT_RT_FULL BUG_ON(!irqs_disabled()); -#endif + llnode = llist_del_all(this_list); while (llnode != NULL) { work = llist_entry(llnode, struct irq_work, llnode); @@ -169,9 +152,7 @@ static void __irq_work_run(void) */ void irq_work_run(void) { -#ifndef CONFIG_PREEMPT_RT_FULL BUG_ON(!in_irq()); -#endif __irq_work_run(); } EXPORT_SYMBOL_GPL(irq_work_run); diff --git a/kernel/itimer.c b/kernel/itimer.c index d051390..8d262b4 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -213,7 +213,6 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); - hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b66ab9e..9659d38 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -132,15 +132,6 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_KEXEC */ -#if defined(CONFIG_PREEMPT_RT_FULL) -static ssize_t realtime_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", 1); -} -KERNEL_ATTR_RO(realtime); -#endif - /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -206,9 +197,6 @@ static struct attribute * kernel_attrs[] = { &vmcoreinfo_attr.attr, #endif &rcu_expedited_attr.attr, -#ifdef CONFIG_PREEMPT_RT_FULL - &realtime_attr.attr, -#endif NULL }; diff --git a/kernel/lglock.c b/kernel/lglock.c index f2356df..86ae2ae 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -4,15 +4,6 @@ #include <linux/cpu.h> #include <linux/string.h> -#ifndef CONFIG_PREEMPT_RT_FULL -# define lg_lock_ptr arch_spinlock_t -# define lg_do_lock(l) arch_spin_lock(l) -# define lg_do_unlock(l) arch_spin_unlock(l) -#else -# define lg_lock_ptr struct rt_mutex -# define lg_do_lock(l) __rt_spin_lock(l) -# define lg_do_unlock(l) __rt_spin_unlock(l) -#endif /* * Note there is no uninit, so lglocks cannot be defined in * modules (but it's fine to use them from there) @@ -21,60 +12,51 @@ void lg_lock_init(struct lglock *lg, char *name) { -#ifdef CONFIG_PREEMPT_RT_FULL - int i; - - for_each_possible_cpu(i) { - struct rt_mutex *lock = per_cpu_ptr(lg->lock, i); - - rt_mutex_init(lock); - } -#endif LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); } EXPORT_SYMBOL(lg_lock_init); void lg_local_lock(struct lglock *lg) { - lg_lock_ptr *lock; + arch_spinlock_t *lock; - migrate_disable(); + preempt_disable(); lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = this_cpu_ptr(lg->lock); - lg_do_lock(lock); + arch_spin_lock(lock); } EXPORT_SYMBOL(lg_local_lock); void lg_local_unlock(struct lglock *lg) { - lg_lock_ptr *lock; + arch_spinlock_t 
*lock; lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); - lg_do_unlock(lock); - migrate_enable(); + arch_spin_unlock(lock); + preempt_enable(); } EXPORT_SYMBOL(lg_local_unlock); void lg_local_lock_cpu(struct lglock *lg, int cpu) { - lg_lock_ptr *lock; + arch_spinlock_t *lock; - preempt_disable_nort(); + preempt_disable(); lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); - lg_do_lock(lock); + arch_spin_lock(lock); } EXPORT_SYMBOL(lg_local_lock_cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu) { - lg_lock_ptr *lock; + arch_spinlock_t *lock; lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); - lg_do_unlock(lock); - preempt_enable_nort(); + arch_spin_unlock(lock); + preempt_enable(); } EXPORT_SYMBOL(lg_local_unlock_cpu); @@ -82,12 +64,12 @@ void lg_global_lock(struct lglock *lg) { int i; - preempt_disable_nort(); + preempt_disable(); lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); for_each_possible_cpu(i) { - lg_lock_ptr *lock; + arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); - lg_do_lock(lock); + arch_spin_lock(lock); } } EXPORT_SYMBOL(lg_global_lock); @@ -98,10 +80,10 @@ void lg_global_unlock(struct lglock *lg) lock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { - lg_lock_ptr *lock; + arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); - lg_do_unlock(lock); + arch_spin_unlock(lock); } - preempt_enable_nort(); + preempt_enable(); } EXPORT_SYMBOL(lg_global_unlock); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b74f7a5..e16c45b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3541,7 +3541,6 @@ static void check_flags(unsigned long flags) } } -#ifndef CONFIG_PREEMPT_RT_FULL /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @@ -3556,7 +3555,6 @@ static void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } -#endif if (!debug_locks) print_irqtrace_events(current); diff --git a/kernel/panic.c b/kernel/panic.c index 936d00f..b6c482c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -368,11 +368,9 @@ static u64 oops_id; static int init_oops_id(void) { -#ifndef CONFIG_PREEMPT_RT_FULL if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); else -#endif oops_id++; return 0; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4208655..55e9560 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -312,7 +312,9 @@ static void *pidns_get(struct task_struct *task) struct pid_namespace *ns; rcu_read_lock(); - ns = get_pid_ns(task_active_pid_ns(task)); + ns = task_active_pid_ns(task); + if (ns) + get_pid_ns(ns); rcu_read_unlock(); return ns; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 4bf82f8..c7f31aa 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -3,7 +3,6 @@ */ #include <linux/sched.h> -#include <linux/sched/rt.h> #include <linux/posix-timers.h> #include <linux/errno.h> #include <linux/math64.h> @@ -664,7 +663,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON_NONRT(!irqs_disabled()); + BUG_ON(!irqs_disabled()); ret = 0; old_incr = timer->it.cpu.incr; @@ -1111,7 +1110,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. 
*/ - BUG_ON_NONRT(!irqs_disabled()); + BUG_ON(!irqs_disabled()); arm_timer(timer); spin_unlock(&p->sighand->siglock); @@ -1178,11 +1177,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) sig = tsk->signal; if (sig->cputimer.running) { struct task_cputime group_sample; - unsigned long flags; - raw_spin_lock_irqsave(&sig->cputimer.lock, flags); + raw_spin_lock(&sig->cputimer.lock); group_sample = sig->cputimer.cputime; - raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags); + raw_spin_unlock(&sig->cputimer.lock); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1196,13 +1194,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -static void __run_posix_cpu_timers(struct task_struct *tsk) +void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; unsigned long flags; - BUG_ON_NONRT(!irqs_disabled()); + BUG_ON(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1267,190 +1265,6 @@ static void __run_posix_cpu_timers(struct task_struct *tsk) posix_cpu_timer_kick_nohz(); } -#ifdef CONFIG_PREEMPT_RT_BASE -#include <linux/kthread.h> -#include <linux/cpu.h> -DEFINE_PER_CPU(struct task_struct *, posix_timer_task); -DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); - -static int posix_cpu_timers_thread(void *data) -{ - int cpu = (long)data; - - BUG_ON(per_cpu(posix_timer_task,cpu) != current); - - while (!kthread_should_stop()) { - struct task_struct *tsk = NULL; - struct task_struct *next = NULL; - - if (cpu_is_offline(cpu)) - goto wait_to_die; - - /* grab task list */ - raw_local_irq_disable(); - tsk = per_cpu(posix_timer_tasklist, cpu); - per_cpu(posix_timer_tasklist, cpu) = NULL; - raw_local_irq_enable(); - - /* its possible the list is empty, just return */ - if (!tsk) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - __set_current_state(TASK_RUNNING); - continue; - } - - /* Process task list */ - while (1) { - /* save next */ - next = tsk->posix_timer_list; - - /* run the task timers, clear its ptr and - * unreference it - */ - __run_posix_cpu_timers(tsk); - tsk->posix_timer_list = NULL; - put_task_struct(tsk); - - /* check if this is the last on the list */ - if (next == tsk) - break; - tsk = next; - } - } - return 0; - -wait_to_die: - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -static inline int __fastpath_timer_check(struct task_struct *tsk) -{ - /* tsk == current, ensure it is safe to use ->signal/sighand */ - if (unlikely(tsk->exit_state)) - return 0; - - if (!task_cputime_zero(&tsk->cputime_expires)) - return 1; - - if (!task_cputime_zero(&tsk->signal->cputime_expires)) - return 1; - - return 0; -} - -void run_posix_cpu_timers(struct task_struct *tsk) -{ - unsigned long cpu = smp_processor_id(); - struct task_struct *tasklist; - - BUG_ON(!irqs_disabled()); - if(!per_cpu(posix_timer_task, cpu)) - return; - /* get per-cpu references */ - tasklist = per_cpu(posix_timer_tasklist, cpu); - - /* check to see if we're already queued */ - if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { - get_task_struct(tsk); - if (tasklist) { - tsk->posix_timer_list = tasklist; - } else { - /* - * The list is terminated by a self-pointing - * task_struct - */ - 
tsk->posix_timer_list = tsk; - } - per_cpu(posix_timer_tasklist, cpu) = tsk; - - wake_up_process(per_cpu(posix_timer_task, cpu)); - } -} - -/* - * posix_cpu_thread_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. - */ -static int posix_cpu_thread_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - struct task_struct *p; - struct sched_param param; - - switch (action) { - case CPU_UP_PREPARE: - p = kthread_create(posix_cpu_timers_thread, hcpu, - "posixcputmr/%d",cpu); - if (IS_ERR(p)) - return NOTIFY_BAD; - p->flags |= PF_NOFREEZE; - kthread_bind(p, cpu); - /* Must be high prio to avoid getting starved */ - param.sched_priority = MAX_RT_PRIO-1; - sched_setscheduler(p, SCHED_FIFO, ¶m); - per_cpu(posix_timer_task,cpu) = p; - break; - case CPU_ONLINE: - /* Strictly unneccessary, as first user will wake it. */ - wake_up_process(per_cpu(posix_timer_task,cpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - /* Unbind it from offline cpu so it can run. Fall thru. */ - kthread_bind(per_cpu(posix_timer_task, cpu), - cpumask_any(cpu_online_mask)); - kthread_stop(per_cpu(posix_timer_task,cpu)); - per_cpu(posix_timer_task,cpu) = NULL; - break; - case CPU_DEAD: - kthread_stop(per_cpu(posix_timer_task,cpu)); - per_cpu(posix_timer_task,cpu) = NULL; - break; -#endif - } - return NOTIFY_OK; -} - -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. - */ -static struct notifier_block posix_cpu_thread_notifier = { - .notifier_call = posix_cpu_thread_call, - .priority = 10 -}; - -static int __init posix_cpu_thread_init(void) -{ - void *hcpu = (void *)(long)smp_processor_id(); - /* Start one for boot CPU. */ - unsigned long cpu; - - /* init the per-cpu posix_timer_tasklets */ - for_each_possible_cpu(cpu) - per_cpu(posix_timer_tasklist, cpu) = NULL; - - posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu); - posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu); - register_cpu_notifier(&posix_cpu_thread_notifier); - return 0; -} -early_initcall(posix_cpu_thread_init); -#else /* CONFIG_PREEMPT_RT_BASE */ -void run_posix_cpu_timers(struct task_struct *tsk) -{ - __run_posix_cpu_timers(tsk); -} -#endif /* CONFIG_PREEMPT_RT_BASE */ - /* * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a22b931..424c2d4 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -497,7 +497,6 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - int sig = event->sigev_signo; if ((event->sigev_notify & SIGEV_THREAD_ID ) && (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || @@ -506,8 +505,7 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) || - sig_kernel_coredump(sig))) + ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) return NULL; return task_pid(rtn); @@ -818,20 +816,6 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return overrun; } -/* - * Protected by RCU! 
- */ -static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr) -{ -#ifdef CONFIG_PREEMPT_RT_FULL - if (kc->timer_set == common_timer_set) - hrtimer_wait_for_timer(&timr->it.real.timer); - else - /* FIXME: Whacky hack for posix-cpu-timers */ - schedule_timeout(1); -#endif -} - /* Set a POSIX.1b interval timer. */ /* timr->it_lock is taken. */ static int @@ -909,7 +893,6 @@ retry: if (!timr) return -EINVAL; - rcu_read_lock(); kc = clockid_to_kclock(timr->it_clock); if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -918,12 +901,9 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { - timer_wait_for_callback(kc, timr); rtn = NULL; // We already got the old time... - rcu_read_unlock(); goto retry; } - rcu_read_unlock(); if (old_setting && !error && copy_to_user(old_setting, &old_spec, sizeof (old_spec))) @@ -961,15 +941,10 @@ retry_delete: if (!timer) return -EINVAL; - rcu_read_lock(); if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); - timer_wait_for_callback(clockid_to_kclock(timer->it_clock), - timer); - rcu_read_unlock(); goto retry_delete; } - rcu_read_unlock(); spin_lock(¤t->sighand->siglock); list_del(&timer->list); @@ -995,18 +970,8 @@ static void itimer_delete(struct k_itimer *timer) retry_delete: spin_lock_irqsave(&timer->it_lock, flags); - /* On RT we can race with a deletion */ - if (!timer->it_signal) { - unlock_timer(timer, flags); - return; - } - if (timer_delete_hook(timer) == TIMER_RETRY) { - rcu_read_lock(); unlock_timer(timer, flags); - timer_wait_for_callback(clockid_to_kclock(timer->it_clock), - timer); - rcu_read_unlock(); goto retry_delete; } list_del(&timer->list); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index d26958b..0121dab 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -275,8 +275,6 @@ static int create_image(int platform_mode) local_irq_disable(); - system_state = SYSTEM_SUSPEND; - error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " @@ -304,7 +302,6 @@ static int create_image(int platform_mode) syscore_resume(); Enable_irqs: - system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -430,7 +427,6 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); - system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) @@ -464,7 +460,6 @@ static int resume_target_kernel(bool platform_mode) syscore_resume(); Enable_irqs: - system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -547,7 +542,6 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); - system_state = SYSTEM_SUSPEND; syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; @@ -560,7 +554,6 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); - system_state = SYSTEM_RUNNING; local_irq_enable(); enable_nonboot_cpus(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index e6703bb..62ee437 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -218,8 +218,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); - system_state = SYSTEM_SUSPEND; - error = syscore_suspend(); if (!error) { *wakeup = pm_wakeup_pending(); @@ -230,8 +228,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) syscore_resume(); } - system_state = SYSTEM_RUNNING; - arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); diff --git a/kernel/printk/printk.c 
b/kernel/printk/printk.c index 0a63f7b..c59896c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1029,7 +1029,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) { char *text; int len = 0; - int attempts = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) @@ -1041,14 +1040,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 seq; u32 idx; enum log_flags prev; - int num_msg; -try_again: - attempts++; - if (attempts > 10) { - len = -EBUSY; - goto out; - } - num_msg = 0; + if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ clear_seq = log_first_seq; @@ -1069,14 +1061,6 @@ try_again: prev = msg->flags; idx = log_next(idx); seq++; - num_msg++; - if (num_msg > 5) { - num_msg = 0; - raw_spin_unlock_irq(&logbuf_lock); - raw_spin_lock_irq(&logbuf_lock); - if (clear_seq < log_first_seq) - goto try_again; - } } /* move first record forward until length fits into the buffer */ @@ -1090,21 +1074,12 @@ try_again: prev = msg->flags; idx = log_next(idx); seq++; - num_msg++; - if (num_msg > 5) { - num_msg = 0; - raw_spin_unlock_irq(&logbuf_lock); - raw_spin_lock_irq(&logbuf_lock); - if (clear_seq < log_first_seq) - goto try_again; - } } /* last message fitting into this dump */ next_seq = log_next_seq; len = 0; - prev = 0; while (len >= 0 && seq < next_seq) { struct printk_log *msg = log_from_idx(idx); int textlen; @@ -1139,7 +1114,6 @@ try_again: clear_seq = log_next_seq; clear_idx = log_next_idx; } -out: raw_spin_unlock_irq(&logbuf_lock); kfree(text); @@ -1297,7 +1271,6 @@ static void call_console_drivers(int level, const char *text, size_t len) if (!console_drivers) return; - migrate_disable(); for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1310,7 +1283,6 @@ static void call_console_drivers(int level, const char *text, size_t len) continue; con->write(con, text, len); } - migrate_enable(); } /* @@ -1370,18 +1342,12 @@ static inline int can_use_console(unsigned int cpu) * interrupts disabled. It should return with 'lockbuf_lock' * released but interrupts still disabled. */ -static int console_trylock_for_printk(unsigned int cpu, unsigned long flags) +static int console_trylock_for_printk(unsigned int cpu) __releases(&logbuf_lock) { int retval = 0, wake = 0; -#ifdef CONFIG_PREEMPT_RT_FULL - int lock = !early_boot_irqs_disabled && !irqs_disabled_flags(flags) && - (preempt_count() <= 1); -#else - int lock = 1; -#endif - if (lock && console_trylock()) { + if (console_trylock()) { retval = 1; /* @@ -1521,62 +1487,6 @@ static size_t cont_print_text(char *text, size_t size) return textlen; } -#ifdef CONFIG_EARLY_PRINTK -struct console *early_console; - -void early_vprintk(const char *fmt, va_list ap) -{ - if (early_console) { - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - - early_console->write(early_console, buf, n); - } -} - -asmlinkage void early_printk(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - early_vprintk(fmt, ap); - va_end(ap); -} - -/* - * This is independent of any log levels - a global - * kill switch that turns off all of printk. - * - * Used by the NMI watchdog if early-printk is enabled. 
- */ -static bool __read_mostly printk_killswitch; - -static int __init force_early_printk_setup(char *str) -{ - printk_killswitch = true; - return 0; -} -early_param("force_early_printk", force_early_printk_setup); - -void printk_kill(void) -{ - printk_killswitch = true; -} - -static int forced_early_printk(const char *fmt, va_list ap) -{ - if (!printk_killswitch) - return 0; - early_vprintk(fmt, ap); - return 1; -} -#else -static inline int forced_early_printk(const char *fmt, va_list ap) -{ - return 0; -} -#endif - asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) @@ -1590,13 +1500,6 @@ asmlinkage int vprintk_emit(int facility, int level, int this_cpu; int printed_len = 0; - /* - * Fall back to early_printk if a debugging subsystem has - * killed printk output - */ - if (unlikely(forced_early_printk(fmt, args))) - return 1; - boot_delay_msec(level); printk_delay(); @@ -1716,15 +1619,8 @@ asmlinkage int vprintk_emit(int facility, int level, * The console_trylock_for_printk() function will release 'logbuf_lock' * regardless of whether it actually gets the console semaphore or not. */ - if (console_trylock_for_printk(this_cpu, flags)) { -#ifndef CONFIG_PREEMPT_RT_FULL + if (console_trylock_for_printk(this_cpu)) console_unlock(); -#else - raw_local_irq_restore(flags); - console_unlock(); - raw_local_irq_save(flags); -#endif - } lockdep_on(); out_restore_irqs: @@ -1826,6 +1722,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} +#endif + static int __add_preferred_console(char *name, int idx, char *options, char *brl_options) { @@ -2066,16 +1985,11 @@ static void console_cont_flush(char *text, size_t size) goto out; len = cont_print_text(text, size); -#ifndef CONFIG_PREEMPT_RT_FULL raw_spin_unlock(&logbuf_lock); stop_critical_timings(); call_console_drivers(cont.level, text, len); start_critical_timings(); local_irq_restore(flags); -#else - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - call_console_drivers(cont.level, text, len); -#endif return; out: raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -2158,17 +2072,12 @@ skip: console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; - -#ifndef CONFIG_PREEMPT_RT_FULL raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(level, text, len); start_critical_timings(); local_irq_restore(flags); -#else - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - call_console_drivers(level, text, len); -#endif } console_locked = 0; mutex_release(&console_lock_dep_map, 1, _RET_IP_); @@ -2880,7 +2789,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index fddaf65..1f4bcb3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -135,12 +135,7 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !__fatal_signal_pending(task)) { - raw_spin_lock_irq(&task->pi_lock); - if (task->state & __TASK_TRACED) - task->state = __TASK_TRACED; - else - task->saved_state = __TASK_TRACED; - raw_spin_unlock_irq(&task->pi_lock); + task->state = __TASK_TRACED; ret = true; } spin_unlock_irq(&task->sighand->siglock); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 7e1dd3e..b02a339 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -129,7 +129,6 @@ int notrace debug_lockdep_rcu_enabled(void) } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); -#ifndef CONFIG_PREEMPT_RT_FULL /** * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * @@ -156,7 +155,6 @@ int rcu_read_lock_bh_held(void) return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); -#endif #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index f202b26..9ed6075 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -369,7 +369,6 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); -#ifndef CONFIG_PREEMPT_RT_FULL /* * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. 
@@ -379,7 +378,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); -#endif void rcu_init(void) { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 507fab1..32618b3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -55,11 +55,6 @@ #include <linux/random.h> #include <linux/ftrace_event.h> #include <linux/suspend.h> -#include <linux/delay.h> -#include <linux/gfp.h> -#include <linux/oom.h> -#include <linux/smpboot.h> -#include "time/tick-internal.h" #include "rcutree.h" #include <trace/events/rcu.h> @@ -150,6 +145,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; +#ifdef CONFIG_RCU_BOOST + /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. @@ -159,6 +156,8 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); +#endif /* #ifdef CONFIG_RCU_BOOST */ + static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -200,19 +199,6 @@ void rcu_sched_qs(int cpu) rdp->passed_quiesce = 1; } -#ifdef CONFIG_PREEMPT_RT_FULL -static void rcu_preempt_qs(int cpu); - -void rcu_bh_qs(int cpu) -{ - unsigned long flags; - - /* Callers to this function, rcu_preempt_qs(), must disable irqs. */ - local_irq_save(flags); - rcu_preempt_qs(cpu); - local_irq_restore(flags); -} -#else void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); @@ -221,7 +207,6 @@ void rcu_bh_qs(int cpu) trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; } -#endif /* * Note a context switch. This is a quiescent state for RCU-sched, @@ -278,7 +263,6 @@ long rcu_batches_completed_sched(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); -#ifndef CONFIG_PREEMPT_RT_FULL /* * Return the number of RCU BH batches processed thus far for debug & stats. */ @@ -296,7 +280,6 @@ void rcu_bh_force_quiescent_state(void) force_quiescent_state(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); -#endif /* * Record the number of times rcutorture tests have been initiated and @@ -1488,7 +1471,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { - swait_event_interruptible(rsp->gp_wq, + wait_event_interruptible(rsp->gp_wq, rsp->gp_flags & RCU_GP_FLAG_INIT); if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && @@ -1507,7 +1490,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } for (;;) { rsp->jiffies_force_qs = jiffies + j; - ret = swait_event_interruptible_timeout(rsp->gp_wq, + ret = wait_event_interruptible_timeout(rsp->gp_wq, (rsp->gp_flags & RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), @@ -1545,7 +1528,7 @@ static void rsp_wakeup(struct irq_work *work) struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); /* Wake up rcu_gp_kthread() to start the grace period. */ - swait_wake(&rsp->gp_wq); + wake_up(&rsp->gp_wq); } /* @@ -1619,7 +1602,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - swait_wake(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. 
*/ + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -2189,8 +2172,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } rsp->gp_flags |= RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - /* Memory barrier implied by wake_up() path. */ - swait_wake(&rsp->gp_wq); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -2227,14 +2209,16 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. */ -static void rcu_process_callbacks(void) +static void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; if (cpu_is_offline(smp_processor_id())) return; + trace_rcu_utilization(TPS("Start RCU core")); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); + trace_rcu_utilization(TPS("End RCU core")); } /* @@ -2248,105 +2232,18 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) return; - rcu_do_batch(rsp, rdp); -} - -static void rcu_wake_cond(struct task_struct *t, int status) -{ - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) - wake_up_process(t); -} - -/* - * Wake up this CPU's rcuc kthread to do RCU core processing. - */ -static void invoke_rcu_core(void) -{ - unsigned long flags; - struct task_struct *t; - - if (!cpu_online(smp_processor_id())) + if (likely(!rsp->boost)) { + rcu_do_batch(rsp, rdp); return; - local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - t = __this_cpu_read(rcu_cpu_kthread_task); - if (t != NULL && current != t) - rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status)); - local_irq_restore(flags); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * RCU softirq used in flavors and configurations of RCU that do not - * support RCU priority boosting. - */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); - char work, *workp = &__get_cpu_var(rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - this_cpu_inc(rcu_cpu_kthread_loops); - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_process_callbacks(); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; + invoke_rcu_callbacks_kthread(); } -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - -/* - * Spawn per-CPU RCU core processing kthreads. 
- */ -static int __init rcu_spawn_core_kthreads(void) +static void invoke_rcu_core(void) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(rcu_cpu_has_work, cpu) = 0; - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - return 0; + if (cpu_online(smp_processor_id())) + raise_softirq(RCU_SOFTIRQ); } -early_initcall(rcu_spawn_core_kthreads); /* * Handle any core-RCU processing required by a call_rcu() invocation. @@ -2476,7 +2373,6 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); -#ifndef CONFIG_PREEMPT_RT_FULL /* * Queue an RCU callback for invocation after a quicker grace period. */ @@ -2485,7 +2381,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); -#endif /* * Because a context switch is a grace period for RCU-sched and RCU-bh, @@ -2563,7 +2458,6 @@ void synchronize_sched(void) } EXPORT_SYMBOL_GPL(synchronize_sched); -#ifndef CONFIG_PREEMPT_RT_FULL /** * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. * @@ -2590,7 +2484,6 @@ void synchronize_rcu_bh(void) wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -#endif static int synchronize_sched_expedited_cpu_stop(void *data) { @@ -2765,10 +2658,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); - /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ - if (rcu_nohz_full_cpu(rsp)) - return 0; - /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { @@ -3002,7 +2891,6 @@ static void _rcu_barrier(struct rcu_state *rsp) mutex_unlock(&rsp->barrier_mutex); } -#ifndef CONFIG_PREEMPT_RT_FULL /** * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. */ @@ -3011,7 +2899,6 @@ void rcu_barrier_bh(void) _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); -#endif /** * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. @@ -3315,7 +3202,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - init_swait_head(&rsp->gp_wq); + init_waitqueue_head(&rsp->gp_wq); init_irq_work(&rsp->wakeup_work, rsp_wakeup); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { @@ -3411,6 +3298,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1df8d9e..52be957 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -28,7 +28,6 @@ #include <linux/cpumask.h> #include <linux/seqlock.h> #include <linux/irq_work.h> -#include <linux/wait-simple.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -201,7 +200,7 @@ struct rcu_node { /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU - struct swait_head nocb_gp_wq[2]; + wait_queue_head_t nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; @@ -334,7 +333,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* (approximate). */ int nocb_p_count; /* # CBs being invoked by kthread */ int nocb_p_count_lazy; /* (approximate). 
*/ - struct swait_head nocb_wq; /* For nocb kthreads to sleep on. */ + wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -404,7 +403,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - struct swait_head gp_wq; /* Where GP task waits. */ + wait_queue_head_t gp_wq; /* Where GP task waits. */ int gp_flags; /* Commands for GP task. */ /* End of fields guarded by root rcu_node's lock. */ @@ -528,9 +527,10 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); +static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); -static void rcu_cpu_kthread_setup(unsigned int cpu); #ifdef CONFIG_RCU_BOOST +static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -564,7 +564,6 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj); static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); -static bool rcu_nohz_full_cpu(struct rcu_state *rsp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c849bd4..511e6b4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -24,6 +24,12 @@ * Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/delay.h> +#include <linux/gfp.h> +#include <linux/oom.h> +#include <linux/smpboot.h> +#include "time/tick-internal.h" + #define RCU_KTHREAD_PRIO 1 #ifdef CONFIG_RCU_BOOST @@ -353,7 +359,7 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block. */ - if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) { + if (in_irq() || in_serving_softirq()) { local_irq_restore(flags); return; } @@ -650,6 +656,15 @@ static void rcu_preempt_check_callbacks(int cpu) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } +#ifdef CONFIG_RCU_BOOST + +static void rcu_preempt_do_callbacks(void) +{ + rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + /* * Queue a preemptible-RCU callback for invocation after a grace period. */ @@ -1111,19 +1126,6 @@ void exit_rcu(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ -/* - * If boosting, set rcuc kthreads to realtime priority. 
- */ -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ -#ifdef CONFIG_RCU_BOOST - struct sched_param sp; - - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -#endif /* #ifdef CONFIG_RCU_BOOST */ -} - #ifdef CONFIG_RCU_BOOST #include "rtmutex_common.h" @@ -1155,6 +1157,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) + wake_up_process(t); +} + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1298,6 +1310,23 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } /* + * Wake up the per-CPU kthread to invoke RCU callbacks. + */ +static void invoke_rcu_callbacks_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && + current != __this_cpu_read(rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), + __this_cpu_read(rcu_cpu_kthread_status)); + } + local_irq_restore(flags); +} + +/* * Is the current CPU running the RCU-callbacks kthread? * Caller must have preemption disabled. */ @@ -1351,6 +1380,67 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } +static void rcu_kthread_do_work(void) +{ + rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_preempt_do_callbacks(); +} + +static void rcu_cpu_kthread_setup(unsigned int cpu) +{ + struct sched_param sp; + + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +} + +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __get_cpu_var(rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * RCU softirq used in flavors and configurations of RCU that do not + * support RCU priority boosting. + */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); + char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + int spincnt; + + for (spincnt = 0; spincnt < 10; spincnt++) { + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + this_cpu_inc(rcu_cpu_kthread_loops); + local_irq_disable(); + work = *workp; + *workp = 0; + local_irq_enable(); + if (work) + rcu_kthread_do_work(); + local_bh_enable(); + if (*workp == 0) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } + } + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; +} + /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. 
The CPU hotplug lock is still @@ -1384,14 +1474,27 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + /* * Spawn all kthreads -- called as soon as the scheduler is running. */ static int __init rcu_spawn_kthreads(void) { struct rcu_node *rnp; + int cpu; rcu_scheduler_fully_active = 1; + for_each_possible_cpu(cpu) + per_cpu(rcu_cpu_has_work, cpu) = 0; + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); rnp = rcu_get_root(rcu_state); (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); if (NUM_RCU_NODES > 1) { @@ -1419,6 +1522,11 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); } +static void invoke_rcu_callbacks_kthread(void) +{ + WARN_ON_ONCE(1); +} + static bool rcu_is_callbacks_kthread(void) { return false; @@ -1445,7 +1553,7 @@ static void rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) +#if !defined(CONFIG_RCU_FAST_NO_HZ) /* * Check to see if any future RCU-related work will need to be done @@ -1461,9 +1569,6 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu, NULL); } -#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */ - -#if !defined(CONFIG_RCU_FAST_NO_HZ) /* * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up @@ -1561,8 +1666,6 @@ static bool rcu_try_advance_all_cbs(void) return cbs_ready; } -#ifndef CONFIG_PREEMPT_RT_FULL - /* * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready * to invoke. If the CPU has callbacks, try to advance them. Tell the @@ -1601,7 +1704,6 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) } return 0; } -#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */ /* * Prepare a CPU for idle from an RCU perspective. The first major task @@ -1959,7 +2061,7 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp) */ static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - swait_wake_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); } /* @@ -1977,8 +2079,8 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_swait_head(&rnp->nocb_gp_wq[0]); - init_swait_head(&rnp->nocb_gp_wq[1]); + init_waitqueue_head(&rnp->nocb_gp_wq[0]); + init_waitqueue_head(&rnp->nocb_gp_wq[1]); } /* Is the specified CPU a no-CPUs CPU? */ @@ -2018,7 +2120,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, return; len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { - swait_wake(&rdp->nocb_wq); /* ... only if queue was empty ... */ + wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { wake_up_process(t); /* ... or if many callbacks queued. 
*/ @@ -2108,7 +2210,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - swait_event_interruptible( + wait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); if (likely(d)) @@ -2136,7 +2238,7 @@ static int rcu_nocb_kthread(void *arg) for (;;) { /* If not polling, wait for next batch of callbacks. */ if (!rcu_nocb_poll) - swait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); + wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); list = ACCESS_ONCE(rdp->nocb_head); if (!list) { schedule_timeout_interruptible(1); @@ -2186,7 +2288,7 @@ static int rcu_nocb_kthread(void *arg) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_swait_head(&rdp->nocb_wq); + init_waitqueue_head(&rdp->nocb_wq); } /* Create a kthread for each RCU flavor for each no-CBs CPU. */ @@ -2701,23 +2803,3 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) } #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -/* - * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the - * grace-period kthread will do force_quiescent_state() processing? - * The idea is to avoid waking up RCU core processing on such a - * CPU unless the grace period has extended for too long. - * - * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPUs. - */ -static bool rcu_nohz_full_cpu(struct rcu_state *rsp) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(smp_processor_id()) && - (!rcu_gp_in_progress(rsp) || - ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ))) - return 1; -#endif /* #ifdef CONFIG_NO_HZ_FULL */ - return 0; -} diff --git a/kernel/relay.c b/kernel/relay.c index b915513..5001c98 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -339,10 +339,6 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); - /* - * Stupid polling for now: - */ - mod_timer(&buf->timer, jiffies + 1); } /** @@ -360,7 +356,6 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); - mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -744,6 +739,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; + smp_mb(); + if (waitqueue_active(&buf->read_wait)) + /* + * Calling wake_up_interruptible() from here + * will deadlock if we happen to be logging + * from the scheduler (trying to re-grab + * rq->lock), so defer it. 
+ */ + mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 3fbcb0d..4aa8a30 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -49,7 +49,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val, r = ret = 0; *limit_fail_at = NULL; - local_irq_save_nort(flags); + local_irq_save(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); r = res_counter_charge_locked(c, val, force); @@ -69,7 +69,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val, spin_unlock(&u->lock); } } - local_irq_restore_nort(flags); + local_irq_restore(flags); return ret; } @@ -103,7 +103,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *c; u64 ret = 0; - local_irq_save_nort(flags); + local_irq_save(flags); for (c = counter; c != top; c = c->parent) { u64 r; spin_lock(&c->lock); @@ -112,7 +112,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter, ret = r; spin_unlock(&c->lock); } - local_irq_restore_nort(flags); + local_irq_restore(flags); return ret; } diff --git a/kernel/rt.c b/kernel/rt.c deleted file mode 100644 index 5d17727..0000000 --- a/kernel/rt.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * kernel/rt.c - * - * Real-Time Preemption Support - * - * started by Ingo Molnar: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * historic credit for proving that Linux spinlocks can be implemented via - * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow - * and others) who prototyped it on 2.4 and did lots of comparative - * research and analysis; TimeSys, for proving that you can implement a - * fully preemptible kernel via the use of IRQ threading and mutexes; - * Bill Huey for persuasively arguing on lkml that the mutex model is the - * right one; and to MontaVista, who ported pmutexes to 2.6. - * - * This code is a from-scratch implementation and is not based on pmutexes, - * but the idea of converting spinlocks to mutexes is used here too. - * - * lock debugging, locking tree, deadlock detection: - * - * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Released under the General Public License (GPL). - * - * Includes portions of the generic R/W semaphore implementation from: - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> - * - Derived also from comments by Linus - * - * Pending ownership of locks and ownership stealing: - * - * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt - * - * (also by Steven Rostedt) - * - Converted single pi_lock to individual task locks. - * - * By Esben Nielsen: - * Doing priority inheritance with help of the scheduler. - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - major rework based on Esben Nielsens initial patch - * - replaced thread_info references by task_struct refs - * - removed task->pending_owner dependency - * - BKL drop/reacquire for semaphore style locks to avoid deadlocks - * in the scheduler return path as discussed with Steven Rostedt - * - * Copyright (C) 2006, Kihon Technologies Inc. - * Steven Rostedt <rostedt@goodmis.org> - * - debugged and patched Thomas Gleixner's rework. - * - added back the cmpxchg to the rework. - * - turned atomic require back on for SMP. 
- */ - -#include <linux/spinlock.h> -#include <linux/rtmutex.h> -#include <linux/sched.h> -#include <linux/delay.h> -#include <linux/module.h> -#include <linux/kallsyms.h> -#include <linux/syscalls.h> -#include <linux/interrupt.h> -#include <linux/plist.h> -#include <linux/fs.h> -#include <linux/futex.h> -#include <linux/hrtimer.h> - -#include "rtmutex_common.h" - -/* - * struct mutex functions - */ -void __mutex_do_init(struct mutex *mutex, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); - lockdep_init_map(&mutex->dep_map, name, key, 0); -#endif - mutex->lock.save_state = 0; -} -EXPORT_SYMBOL(__mutex_do_init); - -void __lockfunc _mutex_lock(struct mutex *lock) -{ - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - rt_mutex_lock(&lock->lock); -} -EXPORT_SYMBOL(_mutex_lock); - -int __lockfunc _mutex_lock_interruptible(struct mutex *lock) -{ - int ret; - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_lock_interruptible(&lock->lock, 0); - if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); - return ret; -} -EXPORT_SYMBOL(_mutex_lock_interruptible); - -int __lockfunc _mutex_lock_killable(struct mutex *lock) -{ - int ret; - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_lock_killable(&lock->lock, 0); - if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); - return ret; -} -EXPORT_SYMBOL(_mutex_lock_killable); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) -{ - mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); - rt_mutex_lock(&lock->lock); -} -EXPORT_SYMBOL(_mutex_lock_nested); - -void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) -{ - mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); - rt_mutex_lock(&lock->lock); -} -EXPORT_SYMBOL(_mutex_lock_nest_lock); - -int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) -{ - int ret; - - mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); - ret = rt_mutex_lock_interruptible(&lock->lock, 0); - if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); - return ret; -} -EXPORT_SYMBOL(_mutex_lock_interruptible_nested); - -int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) -{ - int ret; - - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - ret = rt_mutex_lock_killable(&lock->lock, 0); - if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); - return ret; -} -EXPORT_SYMBOL(_mutex_lock_killable_nested); -#endif - -int __lockfunc _mutex_trylock(struct mutex *lock) -{ - int ret = rt_mutex_trylock(&lock->lock); - - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - - return ret; -} -EXPORT_SYMBOL(_mutex_trylock); - -void __lockfunc _mutex_unlock(struct mutex *lock) -{ - mutex_release(&lock->dep_map, 1, _RET_IP_); - rt_mutex_unlock(&lock->lock); -} -EXPORT_SYMBOL(_mutex_unlock); - -/* - * rwlock_t functions - */ -int __lockfunc rt_write_trylock(rwlock_t *rwlock) -{ - int ret = rt_mutex_trylock(&rwlock->lock); - - if (ret) { - rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); - migrate_disable(); - } - - return ret; -} -EXPORT_SYMBOL(rt_write_trylock); - -int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) -{ - int ret; - - *flags = 0; - ret = rt_write_trylock(rwlock); - return ret; -} -EXPORT_SYMBOL(rt_write_trylock_irqsave); - -int __lockfunc 
rt_read_trylock(rwlock_t *rwlock) -{ - struct rt_mutex *lock = &rwlock->lock; - int ret = 1; - - /* - * recursive read locks succeed when current owns the lock, - * but not when read_depth == 0 which means that the lock is - * write locked. - */ - if (rt_mutex_owner(lock) != current) { - ret = rt_mutex_trylock(lock); - if (ret) { - rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); - migrate_disable(); - } - } else if (!rwlock->read_depth) { - ret = 0; - } - - if (ret) - rwlock->read_depth++; - - return ret; -} -EXPORT_SYMBOL(rt_read_trylock); - -void __lockfunc rt_write_lock(rwlock_t *rwlock) -{ - rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); - migrate_disable(); - __rt_spin_lock(&rwlock->lock); -} -EXPORT_SYMBOL(rt_write_lock); - -void __lockfunc rt_read_lock(rwlock_t *rwlock) -{ - struct rt_mutex *lock = &rwlock->lock; - - /* - * recursive read locks succeed when current owns the lock - */ - if (rt_mutex_owner(lock) != current) { - rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); - __rt_spin_lock(lock); - migrate_disable(); - } - rwlock->read_depth++; -} - -EXPORT_SYMBOL(rt_read_lock); - -void __lockfunc rt_write_unlock(rwlock_t *rwlock) -{ - /* NOTE: we always pass in '1' for nested, for simplicity */ - rwlock_release(&rwlock->dep_map, 1, _RET_IP_); - __rt_spin_unlock(&rwlock->lock); - migrate_enable(); -} -EXPORT_SYMBOL(rt_write_unlock); - -void __lockfunc rt_read_unlock(rwlock_t *rwlock) -{ - /* Release the lock only when read_depth is down to 0 */ - if (--rwlock->read_depth == 0) { - rwlock_release(&rwlock->dep_map, 1, _RET_IP_); - __rt_spin_unlock(&rwlock->lock); - migrate_enable(); - } -} -EXPORT_SYMBOL(rt_read_unlock); - -unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) -{ - rt_write_lock(rwlock); - - return 0; -} -EXPORT_SYMBOL(rt_write_lock_irqsave); - -unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) -{ - rt_read_lock(rwlock); - - return 0; -} -EXPORT_SYMBOL(rt_read_lock_irqsave); - -void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); - lockdep_init_map(&rwlock->dep_map, name, key, 0); -#endif - rwlock->lock.save_state = 1; - rwlock->read_depth = 0; -} -EXPORT_SYMBOL(__rt_rwlock_init); - -/* - * rw_semaphores - */ - -void rt_up_write(struct rw_semaphore *rwsem) -{ - rwsem_release(&rwsem->dep_map, 1, _RET_IP_); - rt_mutex_unlock(&rwsem->lock); -} -EXPORT_SYMBOL(rt_up_write); - -void rt_up_read(struct rw_semaphore *rwsem) -{ - if (--rwsem->read_depth == 0) { - rwsem_release(&rwsem->dep_map, 1, _RET_IP_); - rt_mutex_unlock(&rwsem->lock); - } -} -EXPORT_SYMBOL(rt_up_read); - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void rt_downgrade_write(struct rw_semaphore *rwsem) -{ - BUG_ON(rt_mutex_owner(&rwsem->lock) != current); - rwsem->read_depth = 1; -} -EXPORT_SYMBOL(rt_downgrade_write); - -int rt_down_write_trylock(struct rw_semaphore *rwsem) -{ - int ret = rt_mutex_trylock(&rwsem->lock); - - if (ret) - rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); - return ret; -} -EXPORT_SYMBOL(rt_down_write_trylock); - -void rt_down_write(struct rw_semaphore *rwsem) -{ - rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); - rt_mutex_lock(&rwsem->lock); -} -EXPORT_SYMBOL(rt_down_write); - -void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) -{ - rwsem_acquire(&rwsem->dep_map, subclass, 0, 
_RET_IP_); - rt_mutex_lock(&rwsem->lock); -} -EXPORT_SYMBOL(rt_down_write_nested); - -void rt_down_write_nested_lock(struct rw_semaphore *rwsem, - struct lockdep_map *nest) -{ - rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_); - rt_mutex_lock(&rwsem->lock); -} - -int rt_down_read_trylock(struct rw_semaphore *rwsem) -{ - struct rt_mutex *lock = &rwsem->lock; - int ret = 1; - - /* - * recursive read locks succeed when current owns the rwsem, - * but not when read_depth == 0 which means that the rwsem is - * write locked. - */ - if (rt_mutex_owner(lock) != current) { - ret = rt_mutex_trylock(&rwsem->lock); - if (ret) - rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); - } else if (!rwsem->read_depth) { - ret = 0; - } - - if (ret) - rwsem->read_depth++; - return ret; -} -EXPORT_SYMBOL(rt_down_read_trylock); - -static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) -{ - struct rt_mutex *lock = &rwsem->lock; - - if (rt_mutex_owner(lock) != current) { - rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); - rt_mutex_lock(&rwsem->lock); - } - rwsem->read_depth++; -} - -void rt_down_read(struct rw_semaphore *rwsem) -{ - __rt_down_read(rwsem, 0); -} -EXPORT_SYMBOL(rt_down_read); - -void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) -{ - __rt_down_read(rwsem, subclass); -} -EXPORT_SYMBOL(rt_down_read_nested); - -void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); - lockdep_init_map(&rwsem->dep_map, name, key, 0); -#endif - rwsem->read_depth = 0; - rwsem->lock.save_state = 0; -} -EXPORT_SYMBOL(__rt_rwsem_init); - -/** - * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 - * @cnt: the atomic which we are to dec - * @lock: the mutex to return holding if we dec to 0 - * - * return true and hold lock if we dec to 0, return false otherwise - */ -int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) -{ - /* dec if we can't possibly hit 0 */ - if (atomic_add_unless(cnt, -1, 1)) - return 0; - /* we might hit 0, so take the lock */ - mutex_lock(lock); - if (!atomic_dec_and_test(cnt)) { - /* when we actually did the dec, we didn't hit 0 */ - mutex_unlock(lock); - return 0; - } - /* we hit 0, and we hold the lock */ - return 1; -} -EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 4057bc6..0dd6aec 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -8,12 +8,6 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * Adaptive Spinlocks: - * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, - * and Peter Morreale, - * Adaptive Spinlocks simplification: - * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> - * * See Documentation/rt-mutex-design.txt for details. 
*/ #include <linux/spinlock.h> @@ -21,7 +15,6 @@ #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/timer.h> -#include <linux/ww_mutex.h> #include "rtmutex_common.h" @@ -75,12 +68,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) clear_rt_mutex_waiters(lock); } -static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) -{ - return waiter && waiter != PI_WAKEUP_INPROGRESS && - waiter != PI_REQUEUE_INPROGRESS; -} - /* * We can speed up the acquire/release, if the architecture * supports cmpxchg and if there's no debugging state to be set up @@ -104,12 +91,6 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif -static inline void init_lists(struct rt_mutex *lock) -{ - if (unlikely(!lock->wait_list.node_list.prev)) - plist_head_init(&lock->wait_list); -} - /* * Calculate task priority from the waiter list priority * @@ -126,18 +107,6 @@ int rt_mutex_getprio(struct task_struct *task) } /* - * Called by sched_setscheduler() to check whether the priority change - * is overruled by a possible priority boosting. - */ -int rt_mutex_check_prio(struct task_struct *task, int newprio) -{ - if (!task_has_pi_waiters(task)) - return 0; - - return task_top_pi_waiter(task)->pi_list_entry.prio <= newprio; -} - -/* * Adjust the priority of a task, after its pi_waiters got modified. * * This can be both boosting and unboosting. task->pi_lock must be held. @@ -168,14 +137,6 @@ static void rt_mutex_adjust_prio(struct task_struct *task) raw_spin_unlock_irqrestore(&task->pi_lock, flags); } -static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter) -{ - if (waiter->savestate) - wake_up_lock_sleeper(waiter->task); - else - wake_up_process(waiter->task); -} - /* * Max number of times we'll walk the boosting chain: */ @@ -249,7 +210,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. */ - if (!rt_mutex_real_waiter(waiter)) + if (!waiter) goto out_unlock_pi; /* @@ -300,15 +261,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); if (!rt_mutex_owner(lock)) { - struct rt_mutex_waiter *lock_top_waiter; - /* * If the requeue above changed the top waiter, then we need * to wake the new top waiter up to try to get the lock. */ - lock_top_waiter = rt_mutex_top_waiter(lock); - if (top_waiter != lock_top_waiter) - rt_mutex_wake_waiter(lock_top_waiter); + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); raw_spin_unlock(&lock->wait_lock); goto out_put_task; } @@ -353,25 +312,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return ret; } - -#define STEAL_NORMAL 0 -#define STEAL_LATERAL 1 - -/* - * Note that RT tasks are excluded from lateral-steals to prevent the - * introduction of an unbounded latency - */ -static inline int lock_is_stealable(struct task_struct *task, - struct task_struct *pendowner, int mode) -{ - if (mode == STEAL_NORMAL || rt_task(task)) { - if (task->prio >= pendowner->prio) - return 0; - } else if (task->prio > pendowner->prio) - return 0; - return 1; -} - /* * Try to take an rt-mutex * @@ -381,9 +321,8 @@ static inline int lock_is_stealable(struct task_struct *task, * @task: the task which wants to acquire the lock * @waiter: the waiter that is queued to the lock's wait list. 
(could be NULL) */ -static int -__try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter, int mode) +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { /* * We have to be careful here if the atomic speedups are @@ -416,14 +355,12 @@ __try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * 3) it is top waiter */ if (rt_mutex_has_waiters(lock)) { - struct task_struct *pown = rt_mutex_top_waiter(lock)->task; - - if (task != pown && !lock_is_stealable(task, pown, mode)) - return 0; + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } } - /* We got the lock. */ - if (waiter || rt_mutex_has_waiters(lock)) { unsigned long flags; struct rt_mutex_waiter *top; @@ -448,6 +385,7 @@ __try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, raw_spin_unlock_irqrestore(&task->pi_lock, flags); } + /* We got the lock. */ debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, task); @@ -457,13 +395,6 @@ __try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, return 1; } -static inline int -try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) -{ - return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL); -} - /* * Task blocks on lock. * @@ -482,23 +413,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, int chain_walk = 0, res; raw_spin_lock_irqsave(&task->pi_lock, flags); - - /* - * In the case of futex requeue PI, this will be a proxy - * lock. The task will wake unaware that it is enqueueed on - * this lock. Avoid blocking on two locks and corrupting - * pi_blocked_on via the PI_WAKEUP_INPROGRESS - * flag. futex_wait_requeue_pi() sets this when it wakes up - * before requeue (due to a signal or timeout). Do not enqueue - * the task if PI_WAKEUP_INPROGRESS is set. - */ - if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return -EAGAIN; - } - - BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)); - __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -523,7 +437,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); - if (rt_mutex_real_waiter(owner->pi_blocked_on)) + if (owner->pi_blocked_on) chain_walk = 1; raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } @@ -578,7 +492,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - rt_mutex_wake_waiter(waiter); + wake_up_process(waiter->task); } /* @@ -617,7 +531,7 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (rt_mutex_real_waiter(owner->pi_blocked_on)) + if (owner->pi_blocked_on) chain_walk = 1; raw_spin_unlock_irqrestore(&owner->pi_lock, flags); @@ -651,371 +565,23 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!rt_mutex_real_waiter(waiter) || - waiter->list_entry.prio == task->prio) { + if (!waiter || waiter->list_entry.prio == task->prio) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + /* gets dropped in rt_mutex_adjust_prio_chain()! 
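The try_to_take_rt_mutex() hunk above folds the RT tree's STEAL_NORMAL/STEAL_LATERAL distinction back into a plain priority check; the removed lock_is_stealable() only differed for equal priorities, and excluded RT tasks from lateral steals to keep latencies bounded. A condensed restatement of that decision (illustrative only; a lower ->prio value means a higher priority):

#include <linux/sched.h>
#include <linux/sched/rt.h>

/* May @task take the lock ahead of the current top waiter? */
static inline bool may_steal(struct task_struct *task,
			     struct task_struct *top_waiter, bool lateral)
{
	if (!lateral || rt_task(task))
		return task->prio < top_waiter->prio;	/* strictly higher priority */
	return task->prio <= top_waiter->prio;		/* equal priority is enough */
}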
*/ get_task_struct(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } -#ifdef CONFIG_PREEMPT_RT_FULL -/* - * preemptible spin_lock functions: - */ -static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, - void (*slowfn)(struct rt_mutex *lock)) -{ - might_sleep(); - - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) - rt_mutex_deadlock_account_lock(lock, current); - else - slowfn(lock); -} - -static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, - void (*slowfn)(struct rt_mutex *lock)) -{ - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) - rt_mutex_deadlock_account_unlock(current); - else - slowfn(lock); -} - -#ifdef CONFIG_SMP -/* - * Note that owner is a speculative pointer and dereferencing relies - * on rcu_read_lock() and the check against the lock owner. - */ -static int adaptive_wait(struct rt_mutex *lock, - struct task_struct *owner) -{ - int res = 0; - - rcu_read_lock(); - for (;;) { - if (owner != rt_mutex_owner(lock)) - break; - /* - * Ensure that owner->on_cpu is dereferenced _after_ - * checking the above to be valid. - */ - barrier(); - if (!owner->on_cpu) { - res = 1; - break; - } - cpu_relax(); - } - rcu_read_unlock(); - return res; -} -#else -static int adaptive_wait(struct rt_mutex *lock, - struct task_struct *orig_owner) -{ - return 1; -} -#endif - -# define pi_lock(lock) raw_spin_lock_irq(lock) -# define pi_unlock(lock) raw_spin_unlock_irq(lock) - -/* - * Slow path lock function spin_lock style: this variant is very - * careful not to miss any non-lock wakeups. - * - * We store the current state under p->pi_lock in p->saved_state and - * the try_to_wake_up() code handles this accordingly. - */ -static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) -{ - struct task_struct *lock_owner, *self = current; - struct rt_mutex_waiter waiter, *top_waiter; - int ret; - - rt_mutex_init_waiter(&waiter, true); - - raw_spin_lock(&lock->wait_lock); - init_lists(lock); - - if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) { - raw_spin_unlock(&lock->wait_lock); - return; - } - - BUG_ON(rt_mutex_owner(lock) == self); - - /* - * We save whatever state the task is in and we'll restore it - * after acquiring the lock taking real wakeups into account - * as well. We are serialized via pi_lock against wakeups. See - * try_to_wake_up(). - */ - pi_lock(&self->pi_lock); - self->saved_state = self->state; - __set_current_state(TASK_UNINTERRUPTIBLE); - pi_unlock(&self->pi_lock); - - ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0); - BUG_ON(ret); - - for (;;) { - /* Try to acquire the lock again. */ - if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL)) - break; - - top_waiter = rt_mutex_top_waiter(lock); - lock_owner = rt_mutex_owner(lock); - - raw_spin_unlock(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(&waiter); - - if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) - schedule_rt_mutex(lock); - - raw_spin_lock(&lock->wait_lock); - - pi_lock(&self->pi_lock); - __set_current_state(TASK_UNINTERRUPTIBLE); - pi_unlock(&self->pi_lock); - } - - /* - * Restore the task state to current->saved_state. We set it - * to the original state above and the try_to_wake_up() code - * has possibly updated it when a real (non-rtmutex) wakeup - * happened while we were blocked. Clear saved_state so - * try_to_wakeup() does not get confused. 
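The saved_state machinery exists because a task can hit a contended sleeping spinlock while it is already TASK_INTERRUPTIBLE for some unrelated wait, and the slowpath's TASK_UNINTERRUPTIBLE must not swallow a genuine wakeup. A sketched timeline of the case being protected against (hypothetical waitqueue, illustrative only):

/*
 * task A: prepare_to_wait(&wq, ...);     A->state = TASK_INTERRUPTIBLE
 * task A: spin_lock(&l);                 contended, RT slowpath runs:
 *             A->saved_state = TASK_INTERRUPTIBLE;
 *             A->state       = TASK_UNINTERRUPTIBLE;
 * task B: wake_up(&wq);                  regular wakeup: A->state does not
 *             match, but A->saved_state does, so try_to_wake_up() records
 *             A->saved_state = TASK_RUNNING instead of losing the wakeup.
 * task A: acquires &l and restores A->state from saved_state (now RUNNING),
 *             so the schedule() in its waitqueue loop falls straight through
 *             rather than blocking on a wakeup that already happened.
 */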
- */ - pi_lock(&self->pi_lock); - __set_current_state(self->saved_state); - self->saved_state = TASK_RUNNING; - pi_unlock(&self->pi_lock); - - /* - * try_to_take_rt_mutex() sets the waiter bit - * unconditionally. We might have to fix that up: - */ - fixup_rt_mutex_waiters(lock); - - BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock)); - BUG_ON(!plist_node_empty(&waiter.list_entry)); - - raw_spin_unlock(&lock->wait_lock); - - debug_rt_mutex_free_waiter(&waiter); -} - -/* - * Slow path to release a rt_mutex spin_lock style - */ -static void __sched __rt_spin_lock_slowunlock(struct rt_mutex *lock) -{ - debug_rt_mutex_unlock(lock); - - rt_mutex_deadlock_account_unlock(current); - - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); - return; - } - - wakeup_next_waiter(lock); - - raw_spin_unlock(&lock->wait_lock); - - /* Undo pi boosting.when necessary */ - rt_mutex_adjust_prio(current); -} - -static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) -{ - raw_spin_lock(&lock->wait_lock); - __rt_spin_lock_slowunlock(lock); -} - -static void noinline __sched rt_spin_lock_slowunlock_hirq(struct rt_mutex *lock) -{ - int ret; - - do { - ret = raw_spin_trylock(&lock->wait_lock); - } while (!ret); - - __rt_spin_lock_slowunlock(lock); -} - -void __lockfunc rt_spin_lock(spinlock_t *lock) -{ - rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); -} -EXPORT_SYMBOL(rt_spin_lock); - -void __lockfunc __rt_spin_lock(struct rt_mutex *lock) -{ - rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); -} -EXPORT_SYMBOL(__rt_spin_lock); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) -{ - rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); -} -EXPORT_SYMBOL(rt_spin_lock_nested); -#endif - -void __lockfunc rt_spin_unlock(spinlock_t *lock) -{ - /* NOTE: we always pass in '1' for nested, for simplicity */ - spin_release(&lock->dep_map, 1, _RET_IP_); - rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); -} -EXPORT_SYMBOL(rt_spin_unlock); - -void __lockfunc rt_spin_unlock_after_trylock_in_irq(spinlock_t *lock) -{ - /* NOTE: we always pass in '1' for nested, for simplicity */ - spin_release(&lock->dep_map, 1, _RET_IP_); - rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_hirq); -} - -void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) -{ - rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); -} -EXPORT_SYMBOL(__rt_spin_unlock); - -/* - * Wait for the lock to get unlocked: instead of polling for an unlock - * (like raw spinlocks do), we lock and unlock, to force the kernel to - * schedule if there's contention: - */ -void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) -{ - spin_lock(lock); - spin_unlock(lock); -} -EXPORT_SYMBOL(rt_spin_unlock_wait); - -int __lockfunc rt_spin_trylock(spinlock_t *lock) -{ - int ret = rt_mutex_trylock(&lock->lock); - - if (ret) - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return ret; -} -EXPORT_SYMBOL(rt_spin_trylock); - -int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) -{ - int ret; - - local_bh_disable(); - ret = rt_mutex_trylock(&lock->lock); - if (ret) { - migrate_disable(); - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } else - local_bh_enable(); - return ret; -} -EXPORT_SYMBOL(rt_spin_trylock_bh); - -int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) -{ - int ret; - 
- *flags = 0; - ret = rt_mutex_trylock(&lock->lock); - if (ret) { - migrate_disable(); - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - return ret; -} -EXPORT_SYMBOL(rt_spin_trylock_irqsave); - -int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) -{ - /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ - if (atomic_add_unless(atomic, -1, 1)) - return 0; - rt_spin_lock(lock); - if (atomic_dec_and_test(atomic)){ - migrate_disable(); - return 1; - } - rt_spin_unlock(lock); - return 0; -} -EXPORT_SYMBOL(atomic_dec_and_spin_lock); - -void -__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif -} -EXPORT_SYMBOL(__rt_spin_lock_init); - -#endif /* PREEMPT_RT_FULL */ - -#ifdef CONFIG_PREEMPT_RT_FULL -static inline int __sched -__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) -{ - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); - struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); - - if (!hold_ctx) - return 0; - - if (unlikely(ctx == hold_ctx)) - return -EALREADY; - - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && - (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; - } - - return 0; -} -#else -static inline int __sched -__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) -{ - BUG(); - return 0; -} - -#endif - /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * @@ -1024,8 +590,7 @@ __mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter, - struct ww_acquire_ctx *ww_ctx) + struct rt_mutex_waiter *waiter) { int ret = 0; @@ -1048,12 +613,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - if (ww_ctx && ww_ctx->acquired > 0) { - ret = __mutex_lock_check_stamp(lock, ww_ctx); - if (ret) - break; - } - raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1067,102 +626,23 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? 
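The stamp comparison above is the wait/wound ordering rule: acquire contexts are stamped at creation, and on contention the younger context (larger stamp) gets -EDEADLK and must back off, while the older one is allowed to wait. A stripped-down sketch of that decision (hypothetical helper; the original also breaks exact-stamp ties by context address, which is omitted here):

#include <linux/errno.h>

/* Return -EDEADLK if the acquiring context is younger than the holder's. */
static int ww_backoff_check(unsigned long my_stamp, unsigned long holder_stamp)
{
	/*
	 * Wrap-safe "newer than" test on the unsigned stamps, mirroring the
	 * (ctx->stamp - hold_ctx->stamp) <= LONG_MAX check in the hunk above.
	 */
	if ((long)(my_stamp - holder_stamp) > 0)
		return -EDEADLK;	/* younger: back off, release, retry */
	return 0;			/* older: keep waiting */
}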
- */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; -} - -#ifdef CONFIG_PREEMPT_RT_FULL -static void ww_mutex_account_lock(struct rt_mutex *lock, - struct ww_acquire_ctx *ww_ctx) -{ - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); - struct rt_mutex_waiter *waiter; - - /* - * This branch gets optimized out for the common case, - * and is only important for ww_mutex_lock. - */ - ww_mutex_lock_acquired(ww, ww_ctx); - ww->ctx = ww_ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. - */ - plist_for_each_entry(waiter, &lock->wait_list, list_entry) { - - /* XXX debug rt mutex waiter wakeup */ - - BUG_ON(waiter->lock != lock); - rt_mutex_wake_waiter(waiter); - } -} - -#else - -static void ww_mutex_account_lock(struct rt_mutex *lock, - struct ww_acquire_ctx *ww_ctx) -{ - BUG(); -} -#endif - /* * Slow path lock function: */ static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock, struct ww_acquire_ctx *ww_ctx) + int detect_deadlock) { struct rt_mutex_waiter waiter; int ret = 0; - rt_mutex_init_waiter(&waiter, false); + debug_rt_mutex_init_waiter(&waiter); raw_spin_lock(&lock->wait_lock); - init_lists(lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - if (ww_ctx) - ww_mutex_account_lock(lock, ww_ctx); raw_spin_unlock(&lock->wait_lock); return 0; } @@ -1179,14 +659,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); if (likely(!ret)) - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, ww_ctx); + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); set_current_state(TASK_RUNNING); if (unlikely(ret)) remove_waiter(lock, &waiter); - else if (ww_ctx) - ww_mutex_account_lock(lock, ww_ctx); /* * try_to_take_rt_mutex() sets the waiter bit @@ -1213,9 +691,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) { int ret = 0; - if (!raw_spin_trylock(&lock->wait_lock)) - return ret; - init_lists(lock); + raw_spin_lock(&lock->wait_lock); if (likely(rt_mutex_owner(lock) != current)) { @@ -1266,33 +742,30 @@ rt_mutex_slowunlock(struct rt_mutex *lock) */ static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, - int detect_deadlock, struct ww_acquire_ctx *ww_ctx, + int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock, - struct ww_acquire_ctx *ww_ctx)) + int detect_deadlock)) { if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, NULL, detect_deadlock, ww_ctx); + return slowfn(lock, state, NULL, detect_deadlock); } static inline int rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock, - struct ww_acquire_ctx *ww_ctx, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock, - struct ww_acquire_ctx *ww_ctx)) + int detect_deadlock)) { if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, timeout, detect_deadlock, ww_ctx); 
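rt_mutex_fastlock() and its siblings all share one shape: an uncontended lock is taken by a single cmpxchg of the owner field from NULL to current, and only contention (or an explicit deadlock-detection request) enters the slowpath. That shape in isolation, reusing the file-local rt_mutex_cmpxchg() helper defined earlier in rtmutex.c (condensed sketch, debug and deadlock accounting omitted):

static inline void fastlock_sketch(struct rt_mutex *lock,
				   void (*slowfn)(struct rt_mutex *lock))
{
	/* Uncontended: atomically claim ownership, NULL -> current. */
	if (likely(rt_mutex_cmpxchg(lock, NULL, current)))
		return;

	/* Contended, or the architecture lacks cmpxchg: queue in the slowpath. */
	slowfn(lock);
}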
+ return slowfn(lock, state, timeout, detect_deadlock); } static inline int @@ -1325,19 +798,19 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, rt_mutex_slowlock); + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, @@ -1346,43 +819,22 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, might_sleep(); return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, - detect_deadlock, NULL, rt_mutex_slowlock); + detect_deadlock, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** - * rt_mutex_lock_killable - lock a rt_mutex killable - * - * @lock: the rt_mutex to be locked - * @detect_deadlock: deadlock detection on/off - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -EDEADLK when the lock would deadlock (when deadlock detection is on) - */ -int __sched rt_mutex_lock_killable(struct rt_mutex *lock, - int detect_deadlock) -{ - might_sleep(); - - return rt_mutex_fastlock(lock, TASK_KILLABLE, - detect_deadlock, NULL, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); - -/** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided * by the caller * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -ETIMEDOUT when the timeout expired * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ @@ -1393,7 +845,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, might_sleep(); return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - detect_deadlock, NULL, rt_mutex_slowlock); + detect_deadlock, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1451,11 +903,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; + raw_spin_lock_init(&lock->wait_lock); plist_head_init(&lock->wait_list); debug_rt_mutex_init(lock, name); } -EXPORT_SYMBOL(__rt_mutex_init); +EXPORT_SYMBOL_GPL(__rt_mutex_init); /** * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a @@ -1470,7 +923,7 @@ EXPORT_SYMBOL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - rt_mutex_init(lock); + __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); @@ -1519,35 +972,6 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } -#ifdef CONFIG_PREEMPT_RT_FULL - /* - * In PREEMPT_RT there's an added race. - * If the task, that we are about to requeue, times out, - * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue - * to skip this task. 
But right after the task sets - * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then - * block on the spin_lock(&hb->lock), which in RT is an rtmutex. - * This will replace the PI_WAKEUP_INPROGRESS with the actual - * lock that it blocks on. We *must not* place this task - * on this proxy lock in that case. - * - * To prevent this race, we first take the task's pi_lock - * and check if it has updated its pi_blocked_on. If it has, - * we assume that it woke up and we return -EAGAIN. - * Otherwise, we set the task's pi_blocked_on to - * PI_REQUEUE_INPROGRESS, so that if the task is waking up - * it will know that we are in the process of requeuing it. - */ - raw_spin_lock_irq(&task->pi_lock); - if (task->pi_blocked_on) { - raw_spin_unlock_irq(&task->pi_lock); - raw_spin_unlock(&lock->wait_lock); - return -EAGAIN; - } - task->pi_blocked_on = PI_REQUEUE_INPROGRESS; - raw_spin_unlock_irq(&task->pi_lock); -#endif - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); if (ret && !rt_mutex_owner(lock)) { @@ -1617,7 +1041,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); set_current_state(TASK_RUNNING); @@ -1634,88 +1058,3 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, return ret; } - -static inline int -ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ -#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH - unsigned tmp; - - if (ctx->deadlock_inject_countdown-- == 0) { - tmp = ctx->deadlock_inject_interval; - if (tmp > UINT_MAX/4) - tmp = UINT_MAX; - else - tmp = tmp*2 + tmp + tmp/2; - - ctx->deadlock_inject_interval = tmp; - ctx->deadlock_inject_countdown = tmp; - ctx->contending_lock = lock; - - ww_mutex_unlock(lock); - - return -EDEADLK; - } -#endif - - return 0; -} - -#ifdef CONFIG_PREEMPT_RT_FULL -int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - int ret; - - might_sleep(); - - mutex_acquire(&lock->base.dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx); - if (ret) - mutex_release(&lock->base.dep_map, 1, _RET_IP_); - else if (!ret && ww_ctx->acquired > 1) - return ww_mutex_deadlock_injection(lock, ww_ctx); - - return ret; -} -EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); - -int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - int ret; - - might_sleep(); - - mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, - _RET_IP_); - ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx); - if (ret) - mutex_release(&lock->base.dep_map, 1, _RET_IP_); - else if (!ret && ww_ctx->acquired > 1) - return ww_mutex_deadlock_injection(lock, ww_ctx); - - return ret; -} -EXPORT_SYMBOL_GPL(__ww_mutex_lock); - -void __sched ww_mutex_unlock(struct ww_mutex *lock) -{ - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ - if (lock->ctx) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); -#endif - if (lock->ctx->acquired > 0) - lock->ctx->acquired--; - lock->ctx = NULL; - } - - mutex_release(&lock->base.dep_map, 1, _RET_IP_); - rt_mutex_unlock(&lock->base.lock); -} -EXPORT_SYMBOL(ww_mutex_unlock); -#endif diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 6ec3dc1..53a66c8 100644 --- a/kernel/rtmutex_common.h +++ 
b/kernel/rtmutex_common.h @@ -49,7 +49,6 @@ struct rt_mutex_waiter { struct plist_node pi_list_entry; struct task_struct *task; struct rt_mutex *lock; - bool savestate; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; @@ -104,9 +103,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) /* * PI-futex support (proxy locking functions, etc.): */ -#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) -#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) - extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); @@ -127,12 +123,4 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, # include "rtmutex.h" #endif -static inline void -rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) -{ - debug_rt_mutex_init_waiter(waiter); - waiter->task = NULL; - waiter->savestate = savestate; -} - #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8749d20..a494ace 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -272,11 +272,7 @@ late_initcall(sched_init_debug); * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ -#ifndef CONFIG_PREEMPT_RT_FULL const_debug unsigned int sysctl_sched_nr_migrate = 32; -#else -const_debug unsigned int sysctl_sched_nr_migrate = 8; -#endif /* * period over which we average the RT time consumption, measured @@ -495,7 +491,6 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -540,37 +535,6 @@ void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } -#ifdef CONFIG_PREEMPT_LAZY -void resched_task_lazy(struct task_struct *p) -{ - int cpu; - - if (!sched_feat(PREEMPT_LAZY)) { - resched_task(p); - return; - } - - assert_raw_spin_locked(&task_rq(p)->lock); - - if (test_tsk_need_resched(p)) - return; - - if (test_tsk_need_resched_lazy(p)) - return; - - set_tsk_need_resched_lazy(p); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; - - /* NEED_RESCHED_LAZY must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); -} -#endif - void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -735,17 +699,6 @@ void resched_task(struct task_struct *p) assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } -#ifdef CONFIG_PREEMPT_LAZY -void resched_task_lazy(struct task_struct *p) -{ - if (!sched_feat(PREEMPT_LAZY)) { - resched_task(p); - return; - } - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched_lazy(p); -} -#endif #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -1071,18 +1024,6 @@ struct migration_arg { static int migration_cpu_stop(void *data); -static bool check_task_state(struct task_struct *p, long match_state) -{ - bool match = false; - - raw_spin_lock_irq(&p->pi_lock); - if (p->state == match_state || p->saved_state == match_state) - match = true; - raw_spin_unlock_irq(&p->pi_lock); - - return match; -} - /* * wait_task_inactive - wait for a thread to unschedule. * @@ -1127,7 +1068,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! 
*/ while (task_running(rq, p)) { - if (match_state && !check_task_state(p, match_state)) + if (match_state && unlikely(p->state != match_state)) return 0; cpu_relax(); } @@ -1142,8 +1083,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->on_rq; ncsw = 0; - if (!match_state || p->state == match_state - || p->saved_state == match_state) + if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &flags); @@ -1289,12 +1229,6 @@ out: } } - /* - * Clear PF_NO_SETAFFINITY, otherwise we wreckage - * migrate_disable/enable. See optimization for - * PF_NO_SETAFFINITY tasks there. - */ - p->flags &= ~PF_NO_SETAFFINITY; return dest_cpu; } @@ -1374,6 +1308,10 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = 1; + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); } /* @@ -1548,27 +1486,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) { - /* - * The task might be running due to a spinlock sleeper - * wakeup. Check the saved state and set it to running - * if the wakeup condition is true. - */ - if (!(wake_flags & WF_LOCK_SLEEPER)) { - if (p->saved_state & state) { - p->saved_state = TASK_RUNNING; - success = 1; - } - } + if (!(p->state & state)) goto out; - } - - /* - * If this is a regular wakeup, then we can unconditionally - * clear the saved state of a "lock sleeper". - */ - if (!(wake_flags & WF_LOCK_SLEEPER)) - p->saved_state = TASK_RUNNING; success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1611,6 +1530,42 @@ out: } /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not already there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + + if (!(p->state & TASK_NORMAL)) + goto out; + + if (!p->on_rq) + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + + ttwu_do_wakeup(rq, p, 0); + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -1624,23 +1579,11 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(__task_is_stopped_or_traced(p)); + WARN_ON(task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); -/** - * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" - * @p: The process to be woken up. - * - * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate - * the nature of the wakeup. 
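try_to_wake_up_local() runs with rq->lock already held, but the wakeup path's lock order is p->pi_lock first, then rq->lock, so taking pi_lock directly here could deadlock against a regular wakeup. Hence the trylock, with a drop-and-retake in the canonical order only when it fails. The same pattern in isolation (illustrative names):

#include <linux/spinlock.h>

/* Canonical order is outer then inner; the caller already holds inner. */
static void lock_outer_while_holding_inner(raw_spinlock_t *outer,
					   raw_spinlock_t *inner)
{
	if (raw_spin_trylock(outer))
		return;			/* got it without reordering */

	raw_spin_unlock(inner);		/* back off ... */
	raw_spin_lock(outer);		/* ... and retake both in order */
	raw_spin_lock(inner);
}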
- */ -int wake_up_lock_sleeper(struct task_struct *p) -{ - return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); -} - int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); @@ -1778,9 +1721,6 @@ void sched_fork(struct task_struct *p) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif -#ifdef CONFIG_HAVE_PREEMPT_LAZY - task_thread_info(p)->preempt_lazy_count = 0; -#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif @@ -1947,12 +1887,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); - /* - * We use mmdrop_delayed() here so we don't have to do the - * full __mmdrop() when we are the last user. - */ if (mm) - mmdrop_delayed(mm); + mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -2296,13 +2232,8 @@ void __kprobes add_preempt_count(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); @@ -2345,13 +2276,6 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -2375,133 +2299,6 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } -#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) -#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */ -#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN) -#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN) - -static inline void update_migrate_disable(struct task_struct *p) -{ - const struct cpumask *mask; - - if (likely(!p->migrate_disable)) - return; - - /* Did we already update affinity? */ - if (unlikely(migrate_disabled_updated(p))) - return; - - /* - * Since this is always current we can get away with only locking - * rq->lock, the ->cpus_allowed value can normally only be changed - * while holding both p->pi_lock and rq->lock, but seeing that this - * is current, we cannot actually be waking up, so all code that - * relies on serialization against p->pi_lock is out of scope. - * - * Having rq->lock serializes us against things like - * set_cpus_allowed_ptr() that can still happen concurrently. 
- */ - mask = tsk_cpus_allowed(p); - - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */ - p->nr_cpus_allowed = 1; - - /* Let migrate_enable know to fix things back up */ - p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; -} - -void migrate_disable(void) -{ - struct task_struct *p = current; - - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic++; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } -#endif - - if (p->migrate_disable) { - p->migrate_disable++; - return; - } - - preempt_disable(); - preempt_lazy_disable(); - pin_current_cpu(); - p->migrate_disable = 1; - preempt_enable(); -} -EXPORT_SYMBOL(migrate_disable); - -void migrate_enable(void) -{ - struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; - - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic--; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } -#endif - WARN_ON_ONCE(p->migrate_disable <= 0); - - if (migrate_disable_count(p) > 1) { - p->migrate_disable--; - return; - } - - preempt_disable(); - if (unlikely(migrate_disabled_updated(p))) { - /* - * Undo whatever update_migrate_disable() did, also see there - * about locking. - */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); - - /* - * Clearing migrate_disable causes tsk_cpus_allowed to - * show the tasks original cpu affinity. - */ - p->migrate_disable = 0; - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } else - p->migrate_disable = 0; - - unpin_current_cpu(); - preempt_enable(); - preempt_lazy_enable(); -} -EXPORT_SYMBOL(migrate_enable); -#else -static inline void update_migrate_disable(struct task_struct *p) { } -#define migrate_disabled_updated(p) 0 -#endif - static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->on_rq || rq->skip_clock_update < 0) @@ -2601,8 +2398,6 @@ need_resched: smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); - update_migrate_disable(prev); - switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -2610,6 +2405,19 @@ need_resched: } else { deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; + + /* + * If a worker went to sleep, notify and ask workqueue + * whether it wants to wake up a task to maintain + * concurrency. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } } switch_count = &prev->nvcsw; } @@ -2622,7 +2430,6 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); - clear_tsk_need_resched_lazy(prev); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -2653,14 +2460,6 @@ static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) return; - - /* - * If a worker went to sleep, notify and ask workqueue whether - * it wants to wake up a task to maintain concurrency. 
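migrate_disable()/migrate_enable() nest like a counter and pin the task to its current CPU while leaving it fully preemptible, which is why the RT tree uses them where mainline would reach for preempt_disable(). A typical (hypothetical) use, assuming the RT tree where these helpers are declared:

#include <linux/percpu.h>
#include <linux/preempt.h>

struct worker_ctx { int scratch; };			/* hypothetical */
static DEFINE_PER_CPU(struct worker_ctx, example_ctx);	/* hypothetical */

static void touch_this_cpus_ctx(void)
{
	struct worker_ctx *ctx;

	migrate_disable();		/* stay on this CPU, remain preemptible */
	ctx = this_cpu_ptr(&example_ctx);
	/*
	 * Even if we are preempted or sleep on an rtmutex-based lock here,
	 * we resume on the same CPU, so ctx keeps pointing at this CPU's
	 * instance for the whole section.
	 */
	(void)ctx;			/* CPU-local work would go here */
	migrate_enable();
}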
- */ - if (tsk->flags & PF_WQ_WORKER) - wq_worker_sleeping(tsk); - /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -2669,19 +2468,12 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -static inline void sched_update_worker(struct task_struct *tsk) -{ - if (tsk->flags & PF_WQ_WORKER) - wq_worker_running(tsk); -} - asmlinkage void __sched schedule(void) { struct task_struct *tsk = current; sched_submit_work(tsk); __schedule(); - sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -2727,26 +2519,9 @@ asmlinkage void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; -#ifdef CONFIG_PREEMPT_LAZY - /* - * Check for lazy preemption - */ - if (current_thread_info()->preempt_lazy_count && - !test_thread_flag(TIF_NEED_RESCHED)) - return; -#endif do { add_preempt_count_notrace(PREEMPT_ACTIVE); - /* - * The add/subtract must not be traced by the function - * tracer. But we still want to account for the - * preempt off latency tracer. Since the _notrace versions - * of add/subtract skip the accounting for latency tracer - * we must force it manually. - */ - start_critical_timings(); __schedule(); - stop_critical_timings(); sub_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -2919,10 +2694,10 @@ void complete(struct completion *x) { unsigned long flags; - raw_spin_lock_irqsave(&x->wait.lock, flags); + spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __swait_wake_locked(&x->wait, TASK_NORMAL, 1); - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -2939,10 +2714,10 @@ void complete_all(struct completion *x) { unsigned long flags; - raw_spin_lock_irqsave(&x->wait.lock, flags); + spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __swait_wake_locked(&x->wait, TASK_NORMAL, 0); - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -2951,20 +2726,20 @@ do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { - DEFINE_SWAITER(wait); + DECLARE_WAITQUEUE(wait, current); - swait_prepare_locked(&x->wait, &wait); + __add_wait_queue_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } __set_current_state(state); - raw_spin_unlock_irq(&x->wait.lock); + spin_unlock_irq(&x->wait.lock); timeout = action(timeout); - raw_spin_lock_irq(&x->wait.lock); + spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - swait_finish_locked(&x->wait, &wait); + __remove_wait_queue(&x->wait, &wait); if (!x->done) return timeout; } @@ -2978,9 +2753,9 @@ __wait_for_common(struct completion *x, { might_sleep(); - raw_spin_lock_irq(&x->wait.lock); + spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); - raw_spin_unlock_irq(&x->wait.lock); + spin_unlock_irq(&x->wait.lock); return timeout; } @@ -3156,12 +2931,12 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; - raw_spin_lock_irqsave(&x->wait.lock, flags); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ 
-3179,10 +2954,10 @@ bool completion_done(struct completion *x) unsigned long flags; int ret = 1; - raw_spin_lock_irqsave(&x->wait.lock, flags); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); @@ -3243,8 +3018,7 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance - * logic. Call site only calls if the priority of the task changed. + * Used by the rt_mutex code to implement priority inheritance logic. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -3475,25 +3249,20 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -static void __setscheduler_params(struct task_struct *p, int policy, int prio) -{ - p->policy = policy; - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - set_load_weight(p); -} - /* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - __setscheduler_params(p, policy, prio); + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; + set_load_weight(p); } /* @@ -3515,7 +3284,6 @@ static bool check_same_owner(struct task_struct *p) static int __sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param, bool user) { - int newprio = MAX_RT_PRIO - 1 - param->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; const struct sched_class *prev_class; @@ -3611,13 +3379,10 @@ recheck: } /* - * If not changing anything there's no need to proceed - * further, but store a possible modification of - * reset_on_fork. + * If not changing anything there's no need to proceed further: */ if (unlikely(policy == p->policy && (!rt_policy(policy) || param->sched_priority == p->rt_priority))) { - p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } @@ -3643,25 +3408,6 @@ recheck: task_rq_unlock(rq, p, &flags); goto recheck; } - - p->sched_reset_on_fork = reset_on_fork; - oldprio = p->prio; - - /* - * Special case for priority boosted tasks. - * - * If the new priority is lower or equal (user space view) - * than the current (boosted) priority, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - if (rt_mutex_check_prio(p, newprio)) { - __setscheduler_params(p, policy, param->sched_priority); - task_rq_unlock(rq, p, &flags); - return 0; - } - on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3669,18 +3415,17 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + + oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - enqueue_task(rq, p, oldprio <= p->prio ? 
ENQUEUE_HEAD : 0); - } + if (on_rq) + enqueue_task(rq, p, 0); + check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -4056,17 +3801,9 @@ static inline int should_resched(void) static void __cond_resched(void) { - do { - add_preempt_count(PREEMPT_ACTIVE); - __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - /* - * Check again in case we missed a preemption - * opportunity between schedule and now. - */ - barrier(); - - } while (need_resched()); + add_preempt_count(PREEMPT_ACTIVE); + __schedule(); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) @@ -4107,7 +3844,6 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); -#ifndef CONFIG_PREEMPT_RT_FULL int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -4121,7 +3857,6 @@ int __sched __cond_resched_softirq(void) return 0; } EXPORT_SYMBOL(__cond_resched_softirq); -#endif /** * yield - yield the current processor to other threads. @@ -4471,7 +4206,6 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4479,9 +4213,7 @@ void init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! */ task_thread_info(idle)->preempt_count = 0; -#ifdef CONFIG_HAVE_PREEMPT_LAZY - task_thread_info(idle)->preempt_lazy_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ @@ -4496,90 +4228,11 @@ void init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (!migrate_disabled_updated(p)) { - if (p->sched_class && p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); - } - cpumask_copy(&p->cpus_allowed, new_mask); -} - -static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); -static DEFINE_MUTEX(sched_down_mutex); -static cpumask_t sched_down_cpumask; - -void tell_sched_cpu_down_begin(int cpu) -{ - mutex_lock(&sched_down_mutex); - cpumask_set_cpu(cpu, &sched_down_cpumask); - mutex_unlock(&sched_down_mutex); -} - -void tell_sched_cpu_down_done(int cpu) -{ - mutex_lock(&sched_down_mutex); - cpumask_clear_cpu(cpu, &sched_down_cpumask); - mutex_unlock(&sched_down_mutex); -} + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); -/** - * migrate_me - try to move the current task off this cpu - * - * Used by the pin_current_cpu() code to try to get tasks - * to move off the current CPU as it is going down. - * It will only move the task if the task isn't pinned to - * the CPU (with migrate_disable, affinity or NO_SETAFFINITY) - * and the task has to be in a RUNNING state. Otherwise the - * movement of the task will wake it up (change its state - * to running) when the task did not expect it. - * - * Returns 1 if it succeeded in moving the current task - * 0 otherwise. - */ -int migrate_me(void) -{ - struct task_struct *p = current; - struct migration_arg arg; - struct cpumask *cpumask; - struct cpumask *mask; - unsigned long flags; - unsigned int dest_cpu; - struct rq *rq; - - /* - * We can not migrate tasks bounded to a CPU or tasks not - * running. The movement of the task will wake it up. 
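The enqueue_task() ternary removed a few hunks above is easy to misread because a smaller ->prio value is a higher priority. A mechanical worked example (illustrative):

/*
 * oldprio = 90, new p->prio = 95: the priority was lowered (user-space view);
 *   oldprio <= p->prio is true, so the task is requeued with ENQUEUE_HEAD.
 * oldprio = 95, new p->prio = 90: the priority was raised; the condition is
 *   false and the task goes to the tail, matching the removed comment
 *   "We enqueue to tail when the priority of a task is increased".
 */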
- */ - if (p->flags & PF_NO_SETAFFINITY || p->state) - return 0; - - mutex_lock(&sched_down_mutex); - rq = task_rq_lock(p, &flags); - - cpumask = &__get_cpu_var(sched_cpumasks); - mask = &p->cpus_allowed; - - cpumask_andnot(cpumask, mask, &sched_down_cpumask); - - if (!cpumask_weight(cpumask)) { - /* It's only on this CPU? */ - task_rq_unlock(rq, p, &flags); - mutex_unlock(&sched_down_mutex); - return 0; - } - - dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); - - arg.task = p; - arg.dest_cpu = dest_cpu; - - task_rq_unlock(rq, p, &flags); - - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - mutex_unlock(&sched_down_mutex); - - return 1; + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -4625,7 +4278,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -4714,8 +4367,6 @@ static int migration_cpu_stop(void *data) #ifdef CONFIG_HOTPLUG_CPU -static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); - /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -4728,12 +4379,7 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); - - /* - * Defer the cleanup to an alive cpu. On RT we can neither - * call mmdrop() nor mmdrop_delayed() from here. - */ - per_cpu(idle_last_mm, smp_processor_id()) = mm; + mmdrop(mm); } /* @@ -5057,10 +4703,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: calc_load_migrate(rq); - if (per_cpu(idle_last_mm, cpu)) { - mmdrop(per_cpu(idle_last_mm, cpu)); - per_cpu(idle_last_mm, cpu) = NULL; - } break; #endif } @@ -6933,8 +6575,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + - sched_rcu_preempt_depth(); + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); return (nested == preempt_offset); } @@ -6944,8 +6585,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) @@ -6963,13 +6603,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 1681f49..9994791 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -655,45 +655,37 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - raw_spin_lock(&tsk->vtime_lock); - write_seqcount_begin(&tsk->vtime_seq); + write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); - write_seqcount_end(&tsk->vtime_seq); - raw_spin_unlock(&tsk->vtime_lock); + write_sequnlock(&tsk->vtime_seqlock); } void vtime_gen_account_irq_exit(struct task_struct *tsk) { - raw_spin_lock(&tsk->vtime_lock); - write_seqcount_begin(&tsk->vtime_seq); + write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seq); - raw_spin_unlock(&tsk->vtime_lock); + write_sequnlock(&tsk->vtime_seqlock); } void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - raw_spin_lock(&tsk->vtime_lock); - write_seqcount_begin(&tsk->vtime_seq); + write_seqlock(&tsk->vtime_seqlock); delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); - write_seqcount_end(&tsk->vtime_seq); - raw_spin_unlock(&tsk->vtime_lock); + write_sequnlock(&tsk->vtime_seqlock); } void vtime_user_enter(struct task_struct *tsk) { - raw_spin_lock(&tsk->vtime_lock); - write_seqcount_begin(&tsk->vtime_seq); + write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seq); - raw_spin_unlock(&tsk->vtime_lock); + write_sequnlock(&tsk->vtime_seqlock); } void vtime_guest_enter(struct task_struct *tsk) @@ -705,23 +697,19 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. 
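The vtime hunks here swap a raw spinlock plus seqcount for a plain seqlock, but the reader side (task_gtime() and fetch_task_cputime() further down) keeps the same shape either way: sample the sequence, read the data, and retry if a writer interleaved. A minimal reader in the mainline seqlock form (illustrative):

#include <linux/seqlock.h>
#include <linux/types.h>

static u64 read_consistent_pair(seqlock_t *sl, const u64 *a, const u64 *b)
{
	unsigned int seq;
	u64 sum;

	do {
		seq = read_seqbegin(sl);	/* snapshot the write sequence */
		sum = *a + *b;			/* read the protected data */
	} while (read_seqretry(sl, seq));	/* retry if a writer ran */

	return sum;
}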
*/ - raw_spin_lock(&tsk->vtime_lock); - write_seqcount_begin(&tsk->vtime_seq); + write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_seqcount_end(&tsk->vtime_seq); - raw_spin_unlock(&tsk->vtime_lock); + write_sequnlock(&tsk->vtime_seqlock); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - raw_spin_lock(&tsk->vtime_lock); - write_seqcount_begin(&tsk->vtime_seq); + write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_seqcount_end(&tsk->vtime_seq); - raw_spin_unlock(&tsk->vtime_lock); + write_sequnlock(&tsk->vtime_seqlock); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -734,30 +722,24 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { - raw_spin_lock(&prev->vtime_lock); - write_seqcount_begin(&prev->vtime_seq); + write_seqlock(&prev->vtime_seqlock); prev->vtime_snap_whence = VTIME_SLEEPING; - write_seqcount_end(&prev->vtime_seq); - raw_spin_unlock(&prev->vtime_lock); + write_sequnlock(&prev->vtime_seqlock); - raw_spin_lock(¤t->vtime_lock); - write_seqcount_begin(¤t->vtime_seq); + write_seqlock(¤t->vtime_seqlock); current->vtime_snap_whence = VTIME_SYS; current->vtime_snap = sched_clock_cpu(smp_processor_id()); - write_seqcount_end(¤t->vtime_seq); - raw_spin_unlock(¤t->vtime_lock); + write_sequnlock(¤t->vtime_seqlock); } void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; - raw_spin_lock_irqsave(&t->vtime_lock, flags); - write_seqcount_begin(&t->vtime_seq); + write_seqlock_irqsave(&t->vtime_seqlock, flags); t->vtime_snap_whence = VTIME_SYS; t->vtime_snap = sched_clock_cpu(cpu); - write_seqcount_end(&t->vtime_seq); - raw_spin_unlock_irqrestore(&t->vtime_lock, flags); + write_sequnlock_irqrestore(&t->vtime_seqlock, flags); } cputime_t task_gtime(struct task_struct *t) @@ -766,13 +748,13 @@ cputime_t task_gtime(struct task_struct *t) cputime_t gtime; do { - seq = read_seqcount_begin(&t->vtime_seq); + seq = read_seqbegin(&t->vtime_seqlock); gtime = t->gtime; if (t->flags & PF_VCPU) gtime += vtime_delta(t); - } while (read_seqcount_retry(&t->vtime_seq, seq)); + } while (read_seqretry(&t->vtime_seqlock, seq)); return gtime; } @@ -795,7 +777,7 @@ fetch_task_cputime(struct task_struct *t, *udelta = 0; *sdelta = 0; - seq = read_seqcount_begin(&t->vtime_seq); + seq = read_seqbegin(&t->vtime_seqlock); if (u_dst) *u_dst = *u_src; @@ -819,7 +801,7 @@ fetch_task_cputime(struct task_struct *t, if (t->vtime_snap_whence == VTIME_SYS) *sdelta = delta; } - } while (read_seqcount_retry(&t->vtime_seq, seq)); + } while (read_seqretry(&t->vtime_seqlock, seq)); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 70812af..fd9ca1d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -256,9 +256,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) P(rt_throttled); PN(rt_time); PN(rt_runtime); -#ifdef CONFIG_SMP - P(rt_nr_migratory); -#endif #undef PN #undef P @@ -588,10 +585,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); -#ifdef CONFIG_PREEMPT_RT_FULL - P(migrate_disable); -#endif - P(nr_cpus_allowed); #undef PN #undef __PN #undef P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0af1448..790e2fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1902,7 +1902,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = 
curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task_lazy(rq_of(cfs_rq)->curr); + resched_task(rq_of(cfs_rq)->curr); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -1926,7 +1926,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task_lazy(rq_of(cfs_rq)->curr); + resched_task(rq_of(cfs_rq)->curr); } static void @@ -2047,7 +2047,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_task_lazy(rq_of(cfs_rq)->curr); + resched_task(rq_of(cfs_rq)->curr); return; } /* @@ -2237,7 +2237,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task_lazy(rq_of(cfs_rq)->curr); + resched_task(rq_of(cfs_rq)->curr); } static __always_inline @@ -2837,7 +2837,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task_lazy(p); + resched_task(p); return; } @@ -3704,7 +3704,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task_lazy(curr); + resched_task(curr); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -5979,7 +5979,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_task_lazy(rq->curr); + resched_task(rq->curr); } se->vruntime -= cfs_rq->min_vruntime; @@ -6004,7 +6004,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task_lazy(rq->curr); + resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 4594051..99399f8 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -50,18 +50,11 @@ SCHED_FEAT(LB_BIAS, true) */ SCHED_FEAT(NONTASK_POWER, true) -#ifndef CONFIG_PREEMPT_RT_FULL /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. 
*/ SCHED_FEAT(TTWU_QUEUE, true) -#else -SCHED_FEAT(TTWU_QUEUE, false) -# ifdef CONFIG_PREEMPT_LAZY -SCHED_FEAT(PREEMPT_LAZY, true) -# endif -#endif SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 240fc60..ff04e1a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -43,7 +43,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.irqsafe = 1; rt_b->rt_period_timer.function = sched_rt_period_timer; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2843303..4f31059 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -898,7 +898,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ -#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1046,15 +1045,6 @@ extern void init_sched_fair_class(void); extern void resched_task(struct task_struct *p); extern void resched_cpu(int cpu); -#ifdef CONFIG_PREEMPT_LAZY -extern void resched_task_lazy(struct task_struct *tsk); -#else -static inline void resched_task_lazy(struct task_struct *tsk) -{ - resched_task(tsk); -} -#endif - extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); diff --git a/kernel/signal.c b/kernel/signal.c index 3d32f54..ded28b9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -14,7 +14,6 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> -#include <linux/sched/rt.h> #include <linux/fs.h> #include <linux/tty.h> #include <linux/binfmts.h> @@ -350,45 +349,13 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } -#ifdef __HAVE_ARCH_CMPXCHG -static inline struct sigqueue *get_task_cache(struct task_struct *t) -{ - struct sigqueue *q = t->sigqueue_cache; - - if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) - return NULL; - return q; -} - -static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) -{ - if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) - return 0; - return 1; -} - -#else - -static inline struct sigqueue *get_task_cache(struct task_struct *t) -{ - return NULL; -} - -static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) -{ - return 1; -} - -#endif - /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * -__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, - int override_rlimit, int fromslab) +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; struct user_struct *user; @@ -405,10 +372,7 @@ __sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, if (override_rlimit || atomic_read(&user->sigpending) <= task_rlimit(t, RLIMIT_SIGPENDING)) { - if (!fromslab) - q = get_task_cache(t); - if (!q) - q = kmem_cache_alloc(sigqueue_cachep, flags); + q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } @@ -425,13 +389,6 @@ __sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t 
flags, return q; } -static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, - int override_rlimit) -{ - return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); -} - static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) @@ -441,21 +398,6 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } -static void sigqueue_free_current(struct sigqueue *q) -{ - struct user_struct *up; - - if (q->flags & SIGQUEUE_PREALLOC) - return; - - up = q->user; - if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { - atomic_dec(&up->sigpending); - free_uid(up); - } else - __sigqueue_free(q); -} - void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -469,21 +411,6 @@ void flush_sigqueue(struct sigpending *queue) } /* - * Called from __exit_signal. Flush tsk->pending and - * tsk->sigqueue_cache - */ -void flush_task_sigqueue(struct task_struct *tsk) -{ - struct sigqueue *q; - - flush_sigqueue(&tsk->pending); - - q = get_task_cache(tsk); - if (q) - kmem_cache_free(sigqueue_cachep, q); -} - -/* * Flush all pending signals for a task. */ void __flush_signals(struct task_struct *t) @@ -635,7 +562,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); - sigqueue_free_current(first); + __sigqueue_free(first); } else { /* * Ok, it wasn't in the queue. This must be @@ -681,8 +608,6 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr; - WARN_ON_ONCE(tsk != current); - /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ @@ -1305,8 +1230,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ -static int -do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +int +force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { unsigned long int flags; int ret, blocked, ignored; @@ -1331,39 +1256,6 @@ do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t) return ret; } -int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) -{ -/* - * On some archs, PREEMPT_RT has to delay sending a signal from a trap - * since it can not enable preemption, and the signal code's spin_locks - * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will - * send the signal on exit of the trap. - */ -#ifdef ARCH_RT_DELAYS_SIGNAL_SEND - if (in_atomic()) { - if (WARN_ON_ONCE(t != current)) - return 0; - if (WARN_ON_ONCE(t->forced_info.si_signo)) - return 0; - - if (is_si_special(info)) { - WARN_ON_ONCE(info != SEND_SIG_PRIV); - t->forced_info.si_signo = sig; - t->forced_info.si_errno = 0; - t->forced_info.si_code = SI_KERNEL; - t->forced_info.si_pid = 0; - t->forced_info.si_uid = 0; - } else { - t->forced_info = *info; - } - - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); - return 0; - } -#endif - return do_force_sig_info(sig, info, t); -} - /* * Nuke all other threads in the group. 
*/ @@ -1394,12 +1286,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, struct sighand_struct *sighand; for (;;) { - local_irq_save_nort(*flags); + local_irq_save(*flags); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) { rcu_read_unlock(); - local_irq_restore_nort(*flags); + local_irq_restore(*flags); break; } @@ -1410,7 +1302,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } spin_unlock(&sighand->siglock); rcu_read_unlock(); - local_irq_restore_nort(*flags); + local_irq_restore(*flags); } return sighand; @@ -1655,8 +1547,7 @@ EXPORT_SYMBOL(kill_pid); */ struct sigqueue *sigqueue_alloc(void) { - /* Preallocated sigqueue objects always from the slabcache ! */ - struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); + struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); if (q) q->flags |= SIGQUEUE_PREALLOC; @@ -2017,7 +1908,15 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); + /* + * Don't want to allow preemption here, because + * sys_ptrace() needs this task to be inactive. + * + * XXX: implement read_unlock_no_resched(). + */ + preempt_disable(); read_unlock(&tasklist_lock); + preempt_enable_no_resched(); freezable_schedule(); } else { /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 15ad603..d7d498d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,12 +21,10 @@ #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/rcupdate.h> -#include <linux/delay.h> #include <linux/ftrace.h> #include <linux/smp.h> #include <linux/smpboot.h> #include <linux/tick.h> -#include <linux/locallock.h> #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -64,98 +62,6 @@ char *softirq_to_name[NR_SOFTIRQS] = { "TASKLET", "SCHED", "HRTIMER", "RCU" }; -#ifdef CONFIG_NO_HZ_COMMON -# ifdef CONFIG_PREEMPT_RT_FULL - -struct softirq_runner { - struct task_struct *runner[NR_SOFTIRQS]; -}; - -static DEFINE_PER_CPU(struct softirq_runner, softirq_runners); - -static inline void softirq_set_runner(unsigned int sirq) -{ - struct softirq_runner *sr = &__get_cpu_var(softirq_runners); - - sr->runner[sirq] = current; -} - -static inline void softirq_clr_runner(unsigned int sirq) -{ - struct softirq_runner *sr = &__get_cpu_var(softirq_runners); - - sr->runner[sirq] = NULL; -} - -/* - * On preempt-rt a softirq running context might be blocked on a - * lock. There might be no other runnable task on this CPU because the - * lock owner runs on some other CPU. So we have to go into idle with - * the pending bit set. Therefor we need to check this otherwise we - * warn about false positives which confuses users and defeats the - * whole purpose of this test. - * - * This code is called with interrupts disabled. - */ -void softirq_check_pending_idle(void) -{ - static int rate_limit; - struct softirq_runner *sr = &__get_cpu_var(softirq_runners); - u32 warnpending; - int i; - - if (rate_limit >= 10) - return; - - warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; - for (i = 0; i < NR_SOFTIRQS; i++) { - struct task_struct *tsk = sr->runner[i]; - - /* - * The wakeup code in rtmutex.c wakes up the task - * _before_ it sets pi_blocked_on to NULL under - * tsk->pi_lock. So we need to check for both: state - * and pi_blocked_on. 
- */ - if (tsk) { - raw_spin_lock(&tsk->pi_lock); - if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) { - /* Clear all bits pending in that task */ - warnpending &= ~(tsk->softirqs_raised); - warnpending &= ~(1 << i); - } - raw_spin_unlock(&tsk->pi_lock); - } - } - - if (warnpending) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - warnpending); - rate_limit++; - } -} -# else -/* - * On !PREEMPT_RT we just printk rate limited: - */ -void softirq_check_pending_idle(void) -{ - static int rate_limit; - - if (rate_limit < 10 && - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - local_softirq_pending()); - rate_limit++; - } -} -# endif - -#else /* !CONFIG_NO_HZ_COMMON */ -static inline void softirq_set_runner(unsigned int sirq) { } -static inline void softirq_clr_runner(unsigned int sirq) { } -#endif - /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency @@ -171,57 +77,6 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -static void handle_softirq(unsigned int vec_nr, int cpu, int need_rcu_bh_qs) -{ - struct softirq_action *h = softirq_vec + vec_nr; - unsigned int prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - - if (unlikely(prev_count != preempt_count())) { - pr_err("softirq %u %s %p preempt count leak: %08x -> %08x\n", - vec_nr, softirq_to_name[vec_nr], h->action, - prev_count, (unsigned int) preempt_count()); - preempt_count() = prev_count; - } - if (need_rcu_bh_qs) - rcu_bh_qs(cpu); -} - -#ifndef CONFIG_PREEMPT_RT_FULL -static inline int ksoftirqd_softirq_pending(void) -{ - return local_softirq_pending(); -} - -static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) -{ - unsigned int vec_nr; - - local_irq_enable(); - for (vec_nr = 0; pending; vec_nr++, pending >>= 1) { - if (pending & 1) - handle_softirq(vec_nr, cpu, need_rcu_bh_qs); - } - local_irq_disable(); -} - -static void run_ksoftirqd(unsigned int cpu) -{ - local_irq_disable(); - if (ksoftirqd_softirq_pending()) { - __do_softirq(); - rcu_note_context_switch(cpu); - local_irq_enable(); - cond_resched(); - return; - } - local_irq_enable(); -} - /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -354,51 +209,14 @@ EXPORT_SYMBOL(local_bh_enable_ip); #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) #define MAX_SOFTIRQ_RESTART 10 -#ifdef CONFIG_TRACE_IRQFLAGS -/* - * Convoluted means of passing __do_softirq() a message through the various - * architecture execute_on_stack() bits. - * - * When we run softirqs from irq_exit() and thus on the hardirq stack we need - * to keep the lockdep irq context tracking as tight as possible in order to - * not miss-qualify lock contexts and miss possible deadlocks. 
- */ -static DEFINE_PER_CPU(int, softirq_from_hardirq); - -static inline void lockdep_softirq_from_hardirq(void) -{ - this_cpu_write(softirq_from_hardirq, 1); -} - -static inline void lockdep_softirq_start(void) -{ - if (this_cpu_read(softirq_from_hardirq)) - trace_hardirq_exit(); - lockdep_softirq_enter(); -} - -static inline void lockdep_softirq_end(void) -{ - lockdep_softirq_exit(); - if (this_cpu_read(softirq_from_hardirq)) { - this_cpu_write(softirq_from_hardirq, 0); - trace_hardirq_enter(); - } -} - -#else -static inline void lockdep_softirq_from_hardirq(void) { } -static inline void lockdep_softirq_start(void) { } -static inline void lockdep_softirq_end(void) { } -#endif - asmlinkage void __do_softirq(void) { + struct softirq_action *h; + __u32 pending; unsigned long end = jiffies + MAX_SOFTIRQ_TIME; + int cpu; unsigned long old_flags = current->flags; int max_restart = MAX_SOFTIRQ_RESTART; - __u32 pending; - int cpu; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -411,14 +229,43 @@ asmlinkage void __do_softirq(void) account_irq_enter_time(current); __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); - lockdep_softirq_start(); + lockdep_softirq_enter(); cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - handle_pending_softirqs(pending, cpu, 1); + local_irq_enable(); + + h = softirq_vec; + + do { + if (pending & 1) { + unsigned int vec_nr = h - softirq_vec; + int prev_count = preempt_count(); + + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR "huh, entered softirq %u %s %p" + "with preempt_count %08x," + " exited with %08x?\n", vec_nr, + softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); + preempt_count() = prev_count; + } + + rcu_bh_qs(cpu); + } + h++; + pending >>= 1; + } while (pending); + + local_irq_disable(); pending = local_softirq_pending(); if (pending) { @@ -429,7 +276,8 @@ restart: wakeup_softirqd(); } - lockdep_softirq_end(); + lockdep_softirq_exit(); + account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); @@ -458,261 +306,6 @@ asmlinkage void do_softirq(void) #endif /* - * This function must run with irqs disabled! - */ -void raise_softirq_irqoff(unsigned int nr) -{ - __raise_softirq_irqoff(nr); - - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. 
- */ - if (!in_interrupt()) - wakeup_softirqd(); -} - -void __raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); -} - -static inline void local_bh_disable_nort(void) { local_bh_disable(); } -static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } -static void ksoftirqd_set_sched_params(unsigned int cpu) { } -static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } - -#else /* !PREEMPT_RT_FULL */ - -/* - * On RT we serialize softirq execution with a cpu local lock per softirq - */ -static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks); - -void __init softirq_early_init(void) -{ - int i; - - for (i = 0; i < NR_SOFTIRQS; i++) - local_irq_lock_init(local_softirq_locks[i]); -} - -static void lock_softirq(int which) -{ - local_lock(local_softirq_locks[which]); -} - -static void unlock_softirq(int which) -{ - local_unlock(local_softirq_locks[which]); -} - -static void do_single_softirq(int which, int need_rcu_bh_qs) -{ - unsigned long old_flags = current->flags; - - current->flags &= ~PF_MEMALLOC; - vtime_account_irq_enter(current); - current->flags |= PF_IN_SOFTIRQ; - lockdep_softirq_enter(); - local_irq_enable(); - handle_softirq(which, smp_processor_id(), need_rcu_bh_qs); - local_irq_disable(); - lockdep_softirq_exit(); - current->flags &= ~PF_IN_SOFTIRQ; - vtime_account_irq_enter(current); - tsk_restore_flags(current, old_flags, PF_MEMALLOC); -} - -/* - * Called with interrupts disabled. Process softirqs which were raised - * in current context (or on behalf of ksoftirqd). - */ -static void do_current_softirqs(int need_rcu_bh_qs) -{ - while (current->softirqs_raised) { - int i = __ffs(current->softirqs_raised); - unsigned int pending, mask = (1U << i); - - current->softirqs_raised &= ~mask; - local_irq_enable(); - - /* - * If the lock is contended, we boost the owner to - * process the softirq or leave the critical section - * now. - */ - lock_softirq(i); - local_irq_disable(); - softirq_set_runner(i); - /* - * Check with the local_softirq_pending() bits, - * whether we need to process this still or if someone - * else took care of it. 
- */ - pending = local_softirq_pending(); - if (pending & mask) { - set_softirq_pending(pending & ~mask); - do_single_softirq(i, need_rcu_bh_qs); - } - softirq_clr_runner(i); - unlock_softirq(i); - WARN_ON(current->softirq_nestcnt != 1); - } -} - -void local_bh_disable(void) -{ - if (++current->softirq_nestcnt == 1) - migrate_disable(); -} -EXPORT_SYMBOL(local_bh_disable); - -void local_bh_enable(void) -{ - if (WARN_ON(current->softirq_nestcnt == 0)) - return; - - local_irq_disable(); - if (current->softirq_nestcnt == 1 && current->softirqs_raised) - do_current_softirqs(1); - local_irq_enable(); - - if (--current->softirq_nestcnt == 0) - migrate_enable(); -} -EXPORT_SYMBOL(local_bh_enable); - -void local_bh_enable_ip(unsigned long ip) -{ - local_bh_enable(); -} -EXPORT_SYMBOL(local_bh_enable_ip); - -void _local_bh_enable(void) -{ - if (WARN_ON(current->softirq_nestcnt == 0)) - return; - if (--current->softirq_nestcnt == 0) - migrate_enable(); -} -EXPORT_SYMBOL(_local_bh_enable); - -int in_serving_softirq(void) -{ - return current->flags & PF_IN_SOFTIRQ; -} -EXPORT_SYMBOL(in_serving_softirq); - -/* Called with preemption disabled */ -static void run_ksoftirqd(unsigned int cpu) -{ - local_irq_disable(); - current->softirq_nestcnt++; - do_current_softirqs(1); - current->softirq_nestcnt--; - rcu_note_context_switch(cpu); - local_irq_enable(); -} - -/* - * Called from netif_rx_ni(). Preemption enabled, but migration - * disabled. So the cpu can't go away under us. - */ -void thread_do_softirq(void) -{ - if (!in_serving_softirq() && current->softirqs_raised) { - current->softirq_nestcnt++; - do_current_softirqs(0); - current->softirq_nestcnt--; - } -} - -static void do_raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); - - /* - * If we are not in a hard interrupt and inside a bh disabled - * region, we simply raise the flag on current. local_bh_enable() - * will make sure that the softirq is executed. Otherwise we - * delegate it to ksoftirqd. - */ - if (!in_irq() && current->softirq_nestcnt) - current->softirqs_raised |= (1U << nr); - else if (__this_cpu_read(ksoftirqd)) - __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); -} - -void __raise_softirq_irqoff(unsigned int nr) -{ - do_raise_softirq_irqoff(nr); - if (!in_irq() && !current->softirq_nestcnt) - wakeup_softirqd(); -} - -/* - * This function must run with irqs disabled! - */ -void raise_softirq_irqoff(unsigned int nr) -{ - do_raise_softirq_irqoff(nr); - - /* - * If we're in an hard interrupt we let irq return code deal - * with the wakeup of ksoftirqd. - */ - if (in_irq()) - return; - - /* - * If we are in thread context but outside of a bh disabled - * region, we need to wake ksoftirqd as well. - * - * CHECKME: Some of the places which do that could be wrapped - * into local_bh_disable/enable pairs. Though it's unclear - * whether this is worth the effort. To find those places just - * raise a WARN() if the condition is met. 
- */ - if (!current->softirq_nestcnt) - wakeup_softirqd(); -} - -static inline int ksoftirqd_softirq_pending(void) -{ - return current->softirqs_raised; -} - -static inline void local_bh_disable_nort(void) { } -static inline void _local_bh_enable_nort(void) { } - -static inline void ksoftirqd_set_sched_params(unsigned int cpu) -{ - struct sched_param param = { .sched_priority = 1 }; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - /* Take over all pending softirqs when starting */ - local_irq_disable(); - current->softirqs_raised = local_softirq_pending(); - local_irq_enable(); -} - -static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) -{ - struct sched_param param = { .sched_priority = 0 }; - - sched_setscheduler(current, SCHED_NORMAL, ¶m); -} - -#endif /* PREEMPT_RT_FULL */ -/* * Enter an interrupt context. */ void irq_enter(void) @@ -725,9 +318,9 @@ void irq_enter(void) * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. */ - local_bh_disable_nort(); + local_bh_disable(); tick_check_idle(cpu); - _local_bh_enable_nort(); + _local_bh_enable(); } __irq_enter(); @@ -735,9 +328,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { -#ifndef CONFIG_PREEMPT_RT_FULL if (!force_irqthreads) { - lockdep_softirq_from_hardirq(); /* * We can safely execute softirq on the current stack if * it is the irq stack, because it should be near empty @@ -750,15 +341,6 @@ static inline void invoke_softirq(void) } else { wakeup_softirqd(); } -#else /* PREEMPT_RT_FULL */ - unsigned long flags; - - local_irq_save(flags); - if (__this_cpu_read(ksoftirqd) && - __this_cpu_read(ksoftirqd)->softirqs_raised) - wakeup_softirqd(); - local_irq_restore(flags); -#endif } static inline void tick_irq_exit(void) @@ -786,13 +368,33 @@ void irq_exit(void) #endif account_irq_exit_time(current); + trace_hardirq_exit(); sub_preempt_count(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); tick_irq_exit(); rcu_irq_exit(); - trace_hardirq_exit(); /* must be last! */ +} + +/* + * This function must run with irqs disabled! + */ +inline void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + + /* + * If we're in an interrupt or softirq, we're done + * (this also catches softirq-disabled code). We will + * actually run the softirq once we return from + * the irq or softirq. + * + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. + */ + if (!in_interrupt()) + wakeup_softirqd(); } void raise_softirq(unsigned int nr) @@ -804,6 +406,12 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + void open_softirq(int nr, void (*action)(struct softirq_action *)) { softirq_vec[nr].action = action; @@ -821,45 +429,15 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); -static void inline -__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) -{ - if (tasklet_trylock(t)) { -again: - /* We may have been preempted before tasklet_trylock - * and __tasklet_action may have already run. - * So double check the sched bit while the takslet - * is locked before adding it to the list. 
- */ - if (test_bit(TASKLET_STATE_SCHED, &t->state)) { - t->next = NULL; - *head->tail = t; - head->tail = &(t->next); - raise_softirq_irqoff(nr); - tasklet_unlock(t); - } else { - /* This is subtle. If we hit the corner case above - * It is possible that we get preempted right here, - * and another task has successfully called - * tasklet_schedule(), then this function, and - * failed on the trylock. Thus we must be sure - * before releasing the tasklet lock, that the - * SCHED_BIT is clear. Otherwise the tasklet - * may get its SCHED_BIT set, but not added to the - * list - */ - if (!tasklet_tryunlock(t)) - goto again; - } - } -} - void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); + t->next = NULL; + *__this_cpu_read(tasklet_vec.tail) = t; + __this_cpu_write(tasklet_vec.tail, &(t->next)); + raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -870,7 +448,10 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); + t->next = NULL; + *__this_cpu_read(tasklet_hi_vec.tail) = t; + __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); + raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } @@ -878,117 +459,48 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) { - __tasklet_hi_schedule(t); -} + BUG_ON(!irqs_disabled()); -EXPORT_SYMBOL(__tasklet_hi_schedule_first); - -void tasklet_enable(struct tasklet_struct *t) -{ - if (!atomic_dec_and_test(&t->count)) - return; - if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) - tasklet_schedule(t); + t->next = __this_cpu_read(tasklet_hi_vec.head); + __this_cpu_write(tasklet_hi_vec.head, t); + __raise_softirq_irqoff(HI_SOFTIRQ); } -EXPORT_SYMBOL(tasklet_enable); +EXPORT_SYMBOL(__tasklet_hi_schedule_first); -void tasklet_hi_enable(struct tasklet_struct *t) +static void tasklet_action(struct softirq_action *a) { - if (!atomic_dec_and_test(&t->count)) - return; - if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) - tasklet_hi_schedule(t); -} - -EXPORT_SYMBOL(tasklet_hi_enable); + struct tasklet_struct *list; -static void -__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) -{ - int loops = 1000000; + local_irq_disable(); + list = __this_cpu_read(tasklet_vec.head); + __this_cpu_write(tasklet_vec.head, NULL); + __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); + local_irq_enable(); while (list) { struct tasklet_struct *t = list; list = list->next; - /* - * Should always succeed - after a tasklist got on the - * list (after getting the SCHED bit set from 0 to 1), - * nothing but the tasklet softirq it got queued to can - * lock it: - */ - if (!tasklet_trylock(t)) { - WARN_ON(1); - continue; - } - - t->next = NULL; - - /* - * If we cannot handle the tasklet because it's disabled, - * mark it as pending. tasklet_enable() will later - * re-schedule the tasklet. - */ - if (unlikely(atomic_read(&t->count))) { -out_disabled: - /* implicit unlock: */ - wmb(); - t->state = TASKLET_STATEF_PENDING; - continue; - } - - /* - * After this point on the tasklet might be rescheduled - * on another CPU, but it can only be added to another - * CPU's tasklet list if we unlock the tasklet (which we - * dont do yet). 
- */ - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - WARN_ON(1); - -again: - t->func(t->data); - - /* - * Try to unlock the tasklet. We must use cmpxchg, because - * another CPU might have scheduled or disabled the tasklet. - * We only allow the STATE_RUN -> 0 transition here. - */ - while (!tasklet_tryunlock(t)) { - /* - * If it got disabled meanwhile, bail out: - */ - if (atomic_read(&t->count)) - goto out_disabled; - /* - * If it got scheduled meanwhile, re-execute - * the tasklet function: - */ - if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - goto again; - if (!--loops) { - printk("hm, tasklet state: %08lx\n", t->state); - WARN_ON(1); + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + BUG(); + t->func(t->data); tasklet_unlock(t); - break; + continue; } + tasklet_unlock(t); } - } -} - -static void tasklet_action(struct softirq_action *a) -{ - struct tasklet_struct *list; - local_irq_disable(); - list = __get_cpu_var(tasklet_vec).head; - __get_cpu_var(tasklet_vec).head = NULL; - __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; - local_irq_enable(); - - __tasklet_action(a, list); + local_irq_disable(); + t->next = NULL; + *__this_cpu_read(tasklet_vec.tail) = t; + __this_cpu_write(tasklet_vec.tail, &(t->next)); + __raise_softirq_irqoff(TASKLET_SOFTIRQ); + local_irq_enable(); + } } static void tasklet_hi_action(struct softirq_action *a) @@ -1001,7 +513,29 @@ static void tasklet_hi_action(struct softirq_action *a) __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); local_irq_enable(); - __tasklet_action(a, list); + while (list) { + struct tasklet_struct *t = list; + + list = list->next; + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + BUG(); + t->func(t->data); + tasklet_unlock(t); + continue; + } + tasklet_unlock(t); + } + + local_irq_disable(); + t->next = NULL; + *__this_cpu_read(tasklet_hi_vec.tail) = t; + __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); + __raise_softirq_irqoff(HI_SOFTIRQ); + local_irq_enable(); + } } @@ -1024,7 +558,7 @@ void tasklet_kill(struct tasklet_struct *t) while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do { - msleep(1); + yield(); } while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -1228,26 +762,22 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) -void tasklet_unlock_wait(struct tasklet_struct *t) +static int ksoftirqd_should_run(unsigned int cpu) { - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { - /* - * Hack for now to avoid this busy-loop: - */ -#ifdef CONFIG_PREEMPT_RT_FULL - msleep(1); -#else - barrier(); -#endif - } + return local_softirq_pending(); } -EXPORT_SYMBOL(tasklet_unlock_wait); -#endif -static int ksoftirqd_should_run(unsigned int cpu) +static void run_ksoftirqd(unsigned int cpu) { - return ksoftirqd_softirq_pending(); + local_irq_disable(); + if (local_softirq_pending()) { + __do_softirq(); + rcu_note_context_switch(cpu); + local_irq_enable(); + cond_resched(); + return; + } + local_irq_enable(); } #ifdef CONFIG_HOTPLUG_CPU @@ -1330,8 +860,6 @@ static struct notifier_block cpu_nfb = { static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, - .setup = ksoftirqd_set_sched_params, - .cleanup = ksoftirqd_clr_sched_params, .thread_should_run = ksoftirqd_should_run, .thread_fn = 
run_ksoftirqd, .thread_comm = "ksoftirqd/%u", diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5c76166..4b082b5 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -124,11 +124,8 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); - -#ifndef CONFIG_PREEMPT_RT_FULL BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); -#endif #endif @@ -212,8 +209,6 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif -#ifndef CONFIG_PREEMPT_RT_FULL - #ifndef CONFIG_INLINE_READ_TRYLOCK int __lockfunc _raw_read_trylock(rwlock_t *lock) { @@ -358,8 +353,6 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_raw_write_unlock_bh); #endif -#endif /* !PREEMPT_RT_FULL */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 5f02a3f..c09f295 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -29,12 +29,12 @@ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ bool executed; /* actually executed? */ int ret; /* collected return value */ - struct task_struct *waiter; /* woken when nr_todo reaches 0 */ + struct completion completion; /* fired if nr_todo reaches 0 */ }; /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { - raw_spinlock_t lock; + spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ }; @@ -47,7 +47,7 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); atomic_set(&done->nr_todo, nr_todo); - done->waiter = current; + init_completion(&done->completion); } /* signal completion unless @done is NULL */ @@ -56,10 +56,8 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) if (done) { if (executed) done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) { - wake_up_process(done->waiter); - done->waiter = NULL; - } + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); } } @@ -71,7 +69,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) unsigned long flags; - raw_spin_lock_irqsave(&stopper->lock, flags); + spin_lock_irqsave(&stopper->lock, flags); if (stopper->enabled) { list_add_tail(&work->list, &stopper->works); @@ -79,23 +77,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) } else cpu_stop_signal_done(work->done, false); - raw_spin_unlock_irqrestore(&stopper->lock, flags); -} - -static void wait_for_stop_done(struct cpu_stop_done *done) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - while (atomic_read(&done->nr_todo)) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - /* - * We need to wait until cpu_stop_signal_done() has cleared - * done->waiter. - */ - while (done->waiter) - cpu_relax(); - set_current_state(TASK_RUNNING); + spin_unlock_irqrestore(&stopper->lock, flags); } /** @@ -129,7 +111,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); cpu_stop_queue_work(cpu, &work); - wait_for_stop_done(&done); + wait_for_completion(&done.completion); return done.executed ? 
done.ret : -ENOENT; } @@ -155,12 +137,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static DEFINE_MUTEX(stopper_lock); static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, - struct cpu_stop_done *done, bool inactive) + struct cpu_stop_done *done) { struct cpu_stop_work *work; unsigned int cpu; @@ -174,18 +155,14 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, } /* - * Make sure that all work is queued on all cpus before we - * any of the cpus can execute it. + * Disable preemption while queueing to avoid getting + * preempted by a stopper which might wait for other stoppers + * to enter @fn which can lead to deadlock. */ - if (!inactive) { - mutex_lock(&stopper_lock); - } else { - while (!mutex_trylock(&stopper_lock)) - cpu_relax(); - } + preempt_disable(); for_each_cpu(cpu, cpumask) cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); - mutex_unlock(&stopper_lock); + preempt_enable(); } static int __stop_cpus(const struct cpumask *cpumask, @@ -194,8 +171,8 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done, false); - wait_for_stop_done(&done); + queue_stop_cpus_work(cpumask, fn, arg, &done); + wait_for_completion(&done.completion); return done.executed ? done.ret : -ENOENT; } @@ -274,9 +251,9 @@ static int cpu_stop_should_run(unsigned int cpu) unsigned long flags; int run; - raw_spin_lock_irqsave(&stopper->lock, flags); + spin_lock_irqsave(&stopper->lock, flags); run = !list_empty(&stopper->works); - raw_spin_unlock_irqrestore(&stopper->lock, flags); + spin_unlock_irqrestore(&stopper->lock, flags); return run; } @@ -288,13 +265,13 @@ static void cpu_stopper_thread(unsigned int cpu) repeat: work = NULL; - raw_spin_lock_irq(&stopper->lock); + spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } - raw_spin_unlock_irq(&stopper->lock); + spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; @@ -302,16 +279,6 @@ repeat: struct cpu_stop_done *done = work->done; char ksym_buf[KSYM_NAME_LEN] __maybe_unused; - /* - * Wait until the stopper finished scheduling on all - * cpus - */ - mutex_lock(&stopper_lock); - /* - * Let other cpu threads continue as well - */ - mutex_unlock(&stopper_lock); - /* cpu stop callbacks are not allowed to sleep */ preempt_disable(); @@ -326,13 +293,7 @@ repeat: kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, ksym_buf), arg); - /* - * Make sure that the wakeup and setting done->waiter - * to NULL is atomic. 
- */ - local_irq_disable(); cpu_stop_signal_done(done, true); - local_irq_enable(); goto repeat; } } @@ -351,20 +312,20 @@ static void cpu_stop_park(unsigned int cpu) unsigned long flags; /* drain remaining works */ - raw_spin_lock_irqsave(&stopper->lock, flags); + spin_lock_irqsave(&stopper->lock, flags); list_for_each_entry(work, &stopper->works, list) cpu_stop_signal_done(work->done, false); stopper->enabled = false; - raw_spin_unlock_irqrestore(&stopper->lock, flags); + spin_unlock_irqrestore(&stopper->lock, flags); } static void cpu_stop_unpark(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - raw_spin_lock_irq(&stopper->lock); + spin_lock_irq(&stopper->lock); stopper->enabled = true; - raw_spin_unlock_irq(&stopper->lock); + spin_unlock_irq(&stopper->lock); } static struct smp_hotplug_thread cpu_stop_threads = { @@ -386,7 +347,7 @@ static int __init cpu_stop_init(void) for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - raw_spin_lock_init(&stopper->lock); + spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } @@ -569,11 +530,11 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, set_state(&smdata, STOPMACHINE_PREPARE); cpu_stop_init_done(&done, num_active_cpus()); queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, - &done, true); + &done); ret = stop_machine_cpu_stop(&smdata); /* Busy wait for completion. */ - while (atomic_read(&done.nr_todo)) + while (!completion_done(&done.completion)) cpu_relax(); mutex_unlock(&stop_cpus_mutex); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 23d7203..a6a5bf5 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -73,8 +73,7 @@ static struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; -__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); -__cacheline_aligned_in_smp seqcount_t jiffies_seq; +__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -83,9 +82,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqcount_begin(&jiffies_seq); + seq = read_seqbegin(&jiffies_lock); ret = jiffies_64; - } while (read_seqcount_retry(&jiffies_seq, seq)); + } while (read_seqretry(&jiffies_lock, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d6132cd..af8d1d4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,7 +10,6 @@ #include <linux/workqueue.h> #include <linux/hrtimer.h> #include <linux/jiffies.h> -#include <linux/kthread.h> #include <linux/math64.h> #include <linux/timex.h> #include <linux/time.h> @@ -518,49 +517,10 @@ static void sync_cmos_clock(struct work_struct *work) schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } -#ifdef CONFIG_PREEMPT_RT_FULL -/* - * RT can not call schedule_delayed_work from real interrupt context. - * Need to make a thread to do the real work. 
- */ -static struct task_struct *cmos_delay_thread; -static bool do_cmos_delay; - -static int run_cmos_delay(void *ignore) -{ - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (do_cmos_delay) { - do_cmos_delay = false; - schedule_delayed_work(&sync_cmos_work, 0); - } - schedule(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -void ntp_notify_cmos_timer(void) -{ - do_cmos_delay = true; - /* Make visible before waking up process */ - smp_wmb(); - wake_up_process(cmos_delay_thread); -} - -static __init int create_cmos_delay_thread(void) -{ - cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd"); - BUG_ON(!cmos_delay_thread); - return 0; -} -early_initcall(create_cmos_delay_thread); -#else void ntp_notify_cmos_timer(void) { schedule_delayed_work(&sync_cmos_work, 0); } -#endif /* CONFIG_PREEMPT_RT_FULL */ #else void ntp_notify_cmos_timer(void) { } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 1b80eb0..64522ec 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -63,15 +63,13 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - raw_spin_lock(&jiffies_lock); - write_seqcount_begin(&jiffies_seq); + write_seqlock(&jiffies_lock); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); + write_sequnlock(&jiffies_lock); } update_process_times(user_mode(get_irq_regs())); @@ -132,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqcount_begin(&jiffies_seq); + seq = read_seqbegin(&jiffies_lock); next = tick_next_period; - } while (read_seqcount_retry(&jiffies_seq, seq)); + } while (read_seqretry(&jiffies_lock, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7e5e7f8..bc906ca 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,8 +4,7 @@ #include <linux/hrtimer.h> #include <linux/tick.h> -extern raw_spinlock_t jiffies_lock; -extern seqcount_t jiffies_seq; +extern seqlock_t jiffies_lock; #define CS_NAME_LEN 32 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3740f28..ea20f7d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -62,8 +62,7 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevalute with jiffies_lock held */ - raw_spin_lock(&jiffies_lock); - write_seqcount_begin(&jiffies_seq); + write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -86,8 +85,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); + write_sequnlock(&jiffies_lock); } /* @@ -97,14 +95,12 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - raw_spin_lock(&jiffies_lock); - write_seqcount_begin(&jiffies_seq); + write_seqlock(&jiffies_lock); /* Did we start the jiffies update yet ? 
*/ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); + write_sequnlock(&jiffies_lock); return period; } @@ -221,7 +217,6 @@ static void nohz_full_kick_work_func(struct irq_work *work) static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_work_func, - .flags = IRQ_WORK_HARD_IRQ, }; /* @@ -543,11 +538,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqcount_begin(&jiffies_seq); + seq = read_seqbegin(&jiffies_lock); last_update = last_jiffies_update; last_jiffies = jiffies; time_delta = timekeeping_max_deferment(); - } while (read_seqcount_retry(&jiffies_seq, seq)); + } while (read_seqretry(&jiffies_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || arch_needs_cpu(cpu) || irq_work_needs_cpu()) { @@ -725,7 +720,14 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - softirq_check_pending_idle(); + static int ratelimit; + + if (ratelimit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } return false; } @@ -1110,7 +1112,6 @@ void tick_setup_sched_timer(void) * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ts->sched_timer.irqsafe = 1; ts->sched_timer.function = tick_sched_timer; /* Get the next period (per cpu) */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d3150a7..bfca770 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1754,9 +1754,7 @@ EXPORT_SYMBOL(hardpps); */ void xtime_update(unsigned long ticks) { - raw_spin_lock(&jiffies_lock); - write_seqcount_begin(&jiffies_seq); + write_seqlock(&jiffies_lock); do_timer(ticks); - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); + write_sequnlock(&jiffies_lock); } diff --git a/kernel/timer.c b/kernel/timer.c index cc34e42..4296d13 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -78,9 +78,6 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; -#ifdef CONFIG_PREEMPT_RT_FULL - wait_queue_head_t wait_for_running_timer; -#endif unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; @@ -723,36 +720,6 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } } -#ifndef CONFIG_PREEMPT_RT_FULL -static inline struct tvec_base *switch_timer_base(struct timer_list *timer, - struct tvec_base *old, - struct tvec_base *new) -{ - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&old->lock); - spin_lock(&new->lock); - timer_set_base(timer, new); - return new; -} -#else -static inline struct tvec_base *switch_timer_base(struct timer_list *timer, - struct tvec_base *old, - struct tvec_base *new) -{ - /* - * We cannot do the above because we might be preempted and - * then the preempter would see NULL and loop forever. 
- */ - if (spin_trylock(&new->lock)) { - timer_set_base(timer, new); - spin_unlock(&old->lock); - return new; - } - return old; -} -#endif - static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only, int pinned) @@ -772,15 +739,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - preempt_disable_rt(); cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) cpu = get_nohz_timer_target(); #endif - preempt_enable_rt(); - new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { @@ -791,8 +755,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, * handler yet has not finished. This also guarantees that * the timer is serialized wrt itself. */ - if (likely(base->running_timer != timer)) - base = switch_timer_base(timer, base, new_base); + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + timer_set_base(timer, base); + } } timer->expires = expires; @@ -975,29 +945,6 @@ void add_timer_on(struct timer_list *timer, int cpu) } EXPORT_SYMBOL_GPL(add_timer_on); -#ifdef CONFIG_PREEMPT_RT_FULL -/* - * Wait for a running timer - */ -static void wait_for_running_timer(struct timer_list *timer) -{ - struct tvec_base *base = timer->base; - - if (base->running_timer == timer) - wait_event(base->wait_for_running_timer, - base->running_timer != timer); -} - -# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer) -#else -static inline void wait_for_running_timer(struct timer_list *timer) -{ - cpu_relax(); -} - -# define wakeup_timer_waiters(b) do { } while (0) -#endif - /** * del_timer - deactive a timer. * @timer: the timer to be deactivated @@ -1055,7 +1002,7 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) +#ifdef CONFIG_SMP /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1115,7 +1062,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - wait_for_running_timer(timer); + cpu_relax(); } } EXPORT_SYMBOL(del_timer_sync); @@ -1232,17 +1179,15 @@ static inline void __run_timers(struct tvec_base *base) if (irqsafe) { spin_unlock(&base->lock); call_timer_fn(timer, fn, data); - base->running_timer = NULL; spin_lock(&base->lock); } else { spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); - base->running_timer = NULL; spin_lock_irq(&base->lock); } } } - wakeup_timer_waiters(base); + base->running_timer = NULL; spin_unlock_irq(&base->lock); } @@ -1382,31 +1327,17 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (cpu_is_offline(smp_processor_id())) return expires; -#ifdef CONFIG_PREEMPT_RT_FULL - /* - * On PREEMPT_RT we cannot sleep here. If the trylock does not - * succeed then we return the worst-case 'expires in 1 tick' - * value. We use the rt functions here directly to avoid a - * migrate_disable() call. 
- */ - if (!spin_do_trylock(&base->lock)) - return now + 1; -#else spin_lock(&base->lock); -#endif if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); expires = base->next_timer; } -#ifdef CONFIG_PREEMPT_RT_FULL - rt_spin_unlock_after_trylock_in_irq(&base->lock); -#else spin_unlock(&base->lock); -#endif if (time_before_eq(expires, now)) return now; + return cmp_next_hrtimer_event(now, expires); } #endif @@ -1422,13 +1353,13 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); - scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); -#if defined(CONFIG_IRQ_WORK) +#ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_run(); #endif + scheduler_tick(); run_posix_cpu_timers(p); } @@ -1439,9 +1370,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); -#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) - irq_work_run(); -#endif + hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); @@ -1452,39 +1381,8 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { - struct tvec_base *base = __this_cpu_read(tvec_bases); - hrtimer_run_queues(); - /* - * We can access this lockless as we are in the timer - * interrupt. If there are no timers queued, nothing to do in - * the timer softirq. - */ -#ifdef CONFIG_PREEMPT_RT_FULL - /* On RT, irq work runs from softirq */ - if (irq_work_needs_cpu()) { - raise_softirq(TIMER_SOFTIRQ); - return; - } - - if (!spin_do_trylock(&base->lock)) { - raise_softirq(TIMER_SOFTIRQ); - return; - } -#endif - - if (!base->active_timers) - goto out; - - /* Check whether the next pending timer has expired */ - if (time_before_eq(base->next_timer, jiffies)) - raise_softirq(TIMER_SOFTIRQ); -out: -#ifdef CONFIG_PREEMPT_RT_FULL - rt_spin_unlock_after_trylock_in_irq(&base->lock); -#endif - /* The ; ensures that gcc won't complain in the !RT case */ - ; + raise_softirq(TIMER_SOFTIRQ); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1649,9 +1547,6 @@ static int init_timers_cpu(int cpu) base = per_cpu(tvec_bases, cpu); } -#ifdef CONFIG_PREEMPT_RT_FULL - init_waitqueue_head(&base->wait_for_running_timer); -#endif for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1690,7 +1585,7 @@ static void migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); - new_base = get_local_var(tvec_bases); + new_base = get_cpu_var(tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1711,7 +1606,7 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_local_var(tvec_bases); + put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bbe95b9..015f85a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -192,24 +192,6 @@ config IRQSOFF_TRACER enabled. This option and the preempt-off timing option can be used together or separately.) -config INTERRUPT_OFF_HIST - bool "Interrupts-off Latency Histogram" - depends on IRQSOFF_TRACER - help - This option generates continuously updated histograms (one per cpu) - of the duration of time periods with interrupts disabled. The - histograms are disabled by default. 
To enable them, write a non-zero - number to - - /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff - - If PREEMPT_OFF_HIST is also selected, additional histograms (one - per cpu) are generated that accumulate the duration of time periods - when both interrupts and preemption are disabled. The histogram data - will be located in the debug file system at - - /sys/kernel/debug/tracing/latency_hist/irqsoff - config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n @@ -234,24 +216,6 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) -config PREEMPT_OFF_HIST - bool "Preemption-off Latency Histogram" - depends on PREEMPT_TRACER - help - This option generates continuously updated histograms (one per cpu) - of the duration of time periods with preemption disabled. The - histograms are disabled by default. To enable them, write a non-zero - number to - - /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff - - If INTERRUPT_OFF_HIST is also selected, additional histograms (one - per cpu) are generated that accumulate the duration of time periods - when both interrupts and preemption are disabled. The histogram data - will be located in the debug file system at - - /sys/kernel/debug/tracing/latency_hist/preemptoff - config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -262,74 +226,6 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config WAKEUP_LATENCY_HIST - bool "Scheduling Latency Histogram" - depends on SCHED_TRACER - help - This option generates continuously updated histograms (one per cpu) - of the scheduling latency of the highest priority task. - The histograms are disabled by default. To enable them, write a - non-zero number to - - /sys/kernel/debug/tracing/latency_hist/enable/wakeup - - Two different algorithms are used, one to determine the latency of - processes that exclusively use the highest priority of the system and - another one to determine the latency of processes that share the - highest system priority with other processes. The former is used to - improve hardware and system software, the latter to optimize the - priority design of a given system. The histogram data will be - located in the debug file system at - - /sys/kernel/debug/tracing/latency_hist/wakeup - - and - - /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio - - If both Scheduling Latency Histogram and Missed Timer Offsets - Histogram are selected, additional histogram data will be collected - that contain, in addition to the wakeup latency, the timer latency, in - case the wakeup was triggered by an expired timer. These histograms - are available in the - - /sys/kernel/debug/tracing/latency_hist/timerandwakeup - - directory. They reflect the apparent interrupt and scheduling latency - and are best suitable to determine the worst-case latency of a given - system. To enable these histograms, write a non-zero number to - - /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup - -config MISSED_TIMER_OFFSETS_HIST - depends on HIGH_RES_TIMERS - select GENERIC_TRACER - bool "Missed Timer Offsets Histogram" - help - Generate a histogram of missed timer offsets in microseconds. The - histograms are disabled by default. 
To enable them, write a non-zero - number to - - /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets - - The histogram data will be located in the debug file system at - - /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets - - If both Scheduling Latency Histogram and Missed Timer Offsets - Histogram are selected, additional histogram data will be collected - that contain, in addition to the wakeup latency, the timer latency, in - case the wakeup was triggered by an expired timer. These histograms - are available in the - - /sys/kernel/debug/tracing/latency_hist/timerandwakeup - - directory. They reflect the apparent interrupt and scheduling latency - and are best suitable to determine the worst-case latency of a given - system. To enable these histograms, write a non-zero number to - - /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup - config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f5e0243..d7e2068 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -34,10 +34,6 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o -obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o -obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o -obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o -obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c deleted file mode 100644 index 66a69eb..0000000 --- a/kernel/trace/latency_hist.c +++ /dev/null @@ -1,1178 +0,0 @@ -/* - * kernel/trace/latency_hist.c - * - * Add support for histograms of preemption-off latency and - * interrupt-off latency and wakeup latency, it depends on - * Real-Time Preemption Support. - * - * Copyright (C) 2005 MontaVista Software, Inc. - * Yi Yang <yyang@ch.mvista.com> - * - * Converted to work with the new latency tracer. - * Copyright (C) 2008 Red Hat, Inc. 
- * Steven Rostedt <srostedt@redhat.com> - * - */ -#include <linux/module.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> -#include <linux/percpu.h> -#include <linux/kallsyms.h> -#include <linux/uaccess.h> -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/slab.h> -#include <linux/atomic.h> -#include <asm/div64.h> - -#include "trace.h" -#include <trace/events/sched.h> - -#define NSECS_PER_USECS 1000L - -#define CREATE_TRACE_POINTS -#include <trace/events/hist.h> - -enum { - IRQSOFF_LATENCY = 0, - PREEMPTOFF_LATENCY, - PREEMPTIRQSOFF_LATENCY, - WAKEUP_LATENCY, - WAKEUP_LATENCY_SHAREDPRIO, - MISSED_TIMER_OFFSETS, - TIMERANDWAKEUP_LATENCY, - MAX_LATENCY_TYPE, -}; - -#define MAX_ENTRY_NUM 10240 - -struct hist_data { - atomic_t hist_mode; /* 0 log, 1 don't log */ - long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */ - long min_lat; - long max_lat; - unsigned long long below_hist_bound_samples; - unsigned long long above_hist_bound_samples; - long long accumulate_lat; - unsigned long long total_samples; - unsigned long long hist_array[MAX_ENTRY_NUM]; -}; - -struct enable_data { - int latency_type; - int enabled; -}; - -static char *latency_hist_dir_root = "latency_hist"; - -#ifdef CONFIG_INTERRUPT_OFF_HIST -static DEFINE_PER_CPU(struct hist_data, irqsoff_hist); -static char *irqsoff_hist_dir = "irqsoff"; -static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start); -static DEFINE_PER_CPU(int, hist_irqsoff_counting); -#endif - -#ifdef CONFIG_PREEMPT_OFF_HIST -static DEFINE_PER_CPU(struct hist_data, preemptoff_hist); -static char *preemptoff_hist_dir = "preemptoff"; -static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start); -static DEFINE_PER_CPU(int, hist_preemptoff_counting); -#endif - -#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) -static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist); -static char *preemptirqsoff_hist_dir = "preemptirqsoff"; -static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start); -static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting); -#endif - -#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST) -static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start); -static struct enable_data preemptirqsoff_enabled_data = { - .latency_type = PREEMPTIRQSOFF_LATENCY, - .enabled = 0, -}; -#endif - -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) -struct maxlatproc_data { - char comm[FIELD_SIZEOF(struct task_struct, comm)]; - char current_comm[FIELD_SIZEOF(struct task_struct, comm)]; - int pid; - int current_pid; - int prio; - int current_prio; - long latency; - long timeroffset; - cycle_t timestamp; -}; -#endif - -#ifdef CONFIG_WAKEUP_LATENCY_HIST -static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist); -static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio); -static char *wakeup_latency_hist_dir = "wakeup"; -static char *wakeup_latency_hist_dir_sharedprio = "sharedprio"; -static notrace void probe_wakeup_latency_hist_start(void *v, - struct task_struct *p, int success); -static notrace void probe_wakeup_latency_hist_stop(void *v, - struct task_struct *prev, struct task_struct *next); -static notrace void probe_sched_migrate_task(void *, - struct task_struct *task, int cpu); -static struct enable_data wakeup_latency_enabled_data = { - .latency_type = WAKEUP_LATENCY, - .enabled = 0, -}; -static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc); -static DEFINE_PER_CPU(struct maxlatproc_data, 
wakeup_maxlatproc_sharedprio); -static DEFINE_PER_CPU(struct task_struct *, wakeup_task); -static DEFINE_PER_CPU(int, wakeup_sharedprio); -static unsigned long wakeup_pid; -#endif - -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST -static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets); -static char *missed_timer_offsets_dir = "missed_timer_offsets"; -static notrace void probe_hrtimer_interrupt(void *v, int cpu, - long long offset, struct task_struct *curr, struct task_struct *task); -static struct enable_data missed_timer_offsets_enabled_data = { - .latency_type = MISSED_TIMER_OFFSETS, - .enabled = 0, -}; -static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc); -static unsigned long missed_timer_offsets_pid; -#endif - -#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) -static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist); -static char *timerandwakeup_latency_hist_dir = "timerandwakeup"; -static struct enable_data timerandwakeup_enabled_data = { - .latency_type = TIMERANDWAKEUP_LATENCY, - .enabled = 0, -}; -static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc); -#endif - -void notrace latency_hist(int latency_type, int cpu, long latency, - long timeroffset, cycle_t stop, - struct task_struct *p) -{ - struct hist_data *my_hist; -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - struct maxlatproc_data *mp = NULL; -#endif - - if (!cpu_possible(cpu) || latency_type < 0 || - latency_type >= MAX_LATENCY_TYPE) - return; - - switch (latency_type) { -#ifdef CONFIG_INTERRUPT_OFF_HIST - case IRQSOFF_LATENCY: - my_hist = &per_cpu(irqsoff_hist, cpu); - break; -#endif -#ifdef CONFIG_PREEMPT_OFF_HIST - case PREEMPTOFF_LATENCY: - my_hist = &per_cpu(preemptoff_hist, cpu); - break; -#endif -#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) - case PREEMPTIRQSOFF_LATENCY: - my_hist = &per_cpu(preemptirqsoff_hist, cpu); - break; -#endif -#ifdef CONFIG_WAKEUP_LATENCY_HIST - case WAKEUP_LATENCY: - my_hist = &per_cpu(wakeup_latency_hist, cpu); - mp = &per_cpu(wakeup_maxlatproc, cpu); - break; - case WAKEUP_LATENCY_SHAREDPRIO: - my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); - mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); - break; -#endif -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - case MISSED_TIMER_OFFSETS: - my_hist = &per_cpu(missed_timer_offsets, cpu); - mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); - break; -#endif -#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - case TIMERANDWAKEUP_LATENCY: - my_hist = &per_cpu(timerandwakeup_latency_hist, cpu); - mp = &per_cpu(timerandwakeup_maxlatproc, cpu); - break; -#endif - - default: - return; - } - - latency += my_hist->offset; - - if (atomic_read(&my_hist->hist_mode) == 0) - return; - - if (latency < 0 || latency >= MAX_ENTRY_NUM) { - if (latency < 0) - my_hist->below_hist_bound_samples++; - else - my_hist->above_hist_bound_samples++; - } else - my_hist->hist_array[latency]++; - - if (unlikely(latency > my_hist->max_lat || - my_hist->min_lat == LONG_MAX)) { -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - if (latency_type == WAKEUP_LATENCY || - latency_type == WAKEUP_LATENCY_SHAREDPRIO || - latency_type == MISSED_TIMER_OFFSETS || - latency_type == TIMERANDWAKEUP_LATENCY) { - strncpy(mp->comm, p->comm, sizeof(mp->comm)); - strncpy(mp->current_comm, current->comm, - sizeof(mp->current_comm)); - mp->pid = 
task_pid_nr(p); - mp->current_pid = task_pid_nr(current); - mp->prio = p->prio; - mp->current_prio = current->prio; - mp->latency = latency; - mp->timeroffset = timeroffset; - mp->timestamp = stop; - } -#endif - my_hist->max_lat = latency; - } - if (unlikely(latency < my_hist->min_lat)) - my_hist->min_lat = latency; - my_hist->total_samples++; - my_hist->accumulate_lat += latency; -} - -static void *l_start(struct seq_file *m, loff_t *pos) -{ - loff_t *index_ptr = NULL; - loff_t index = *pos; - struct hist_data *my_hist = m->private; - - if (index == 0) { - char minstr[32], avgstr[32], maxstr[32]; - - atomic_dec(&my_hist->hist_mode); - - if (likely(my_hist->total_samples)) { - long avg = (long) div64_s64(my_hist->accumulate_lat, - my_hist->total_samples); - snprintf(minstr, sizeof(minstr), "%ld", - my_hist->min_lat - my_hist->offset); - snprintf(avgstr, sizeof(avgstr), "%ld", - avg - my_hist->offset); - snprintf(maxstr, sizeof(maxstr), "%ld", - my_hist->max_lat - my_hist->offset); - } else { - strcpy(minstr, "<undef>"); - strcpy(avgstr, minstr); - strcpy(maxstr, minstr); - } - - seq_printf(m, "#Minimum latency: %s microseconds\n" - "#Average latency: %s microseconds\n" - "#Maximum latency: %s microseconds\n" - "#Total samples: %llu\n" - "#There are %llu samples lower than %ld" - " microseconds.\n" - "#There are %llu samples greater or equal" - " than %ld microseconds.\n" - "#usecs\t%16s\n", - minstr, avgstr, maxstr, - my_hist->total_samples, - my_hist->below_hist_bound_samples, - -my_hist->offset, - my_hist->above_hist_bound_samples, - MAX_ENTRY_NUM - my_hist->offset, - "samples"); - } - if (index < MAX_ENTRY_NUM) { - index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); - if (index_ptr) - *index_ptr = index; - } - - return index_ptr; -} - -static void *l_next(struct seq_file *m, void *p, loff_t *pos) -{ - loff_t *index_ptr = p; - struct hist_data *my_hist = m->private; - - if (++*pos >= MAX_ENTRY_NUM) { - atomic_inc(&my_hist->hist_mode); - return NULL; - } - *index_ptr = *pos; - return index_ptr; -} - -static void l_stop(struct seq_file *m, void *p) -{ - kfree(p); -} - -static int l_show(struct seq_file *m, void *p) -{ - int index = *(loff_t *) p; - struct hist_data *my_hist = m->private; - - seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset, - my_hist->hist_array[index]); - return 0; -} - -static const struct seq_operations latency_hist_seq_op = { - .start = l_start, - .next = l_next, - .stop = l_stop, - .show = l_show -}; - -static int latency_hist_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = seq_open(file, &latency_hist_seq_op); - if (!ret) { - struct seq_file *seq = file->private_data; - seq->private = inode->i_private; - } - return ret; -} - -static const struct file_operations latency_hist_fops = { - .open = latency_hist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) -static void clear_maxlatprocdata(struct maxlatproc_data *mp) -{ - mp->comm[0] = mp->current_comm[0] = '\0'; - mp->prio = mp->current_prio = mp->pid = mp->current_pid = - mp->latency = mp->timeroffset = -1; - mp->timestamp = 0; -} -#endif - -static void hist_reset(struct hist_data *hist) -{ - atomic_dec(&hist->hist_mode); - - memset(hist->hist_array, 0, sizeof(hist->hist_array)); - hist->below_hist_bound_samples = 0ULL; - hist->above_hist_bound_samples = 0ULL; - hist->min_lat = LONG_MAX; - hist->max_lat = LONG_MIN; - hist->total_samples = 0ULL; - hist->accumulate_lat = 0LL; 
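For reference, the per-sample bookkeeping performed by the removed latency_hist() above (and undone by hist_reset() right here) can be restated compactly. The sketch below is a user-space simplification with invented names (simple_hist, simple_hist_add); the MAX_ENTRY_NUM bound, the offset convention and the min/max/average tracking follow the deleted code, while per-CPU storage, locking and the max-latency process record are left out.

#include <limits.h>

#define MAX_ENTRY_NUM 10240

/* Simplified stand-in for the removed struct hist_data. */
struct simple_hist {
	long offset;			/* MAX_ENTRY_NUM/2 for a bipolar scale */
	long min_lat;			/* starts at LONG_MAX */
	long max_lat;
	unsigned long long below, above, total;
	long long accumulated;
	unsigned long long bucket[MAX_ENTRY_NUM];
};

/* Mirrors the bucketing in the removed latency_hist(). */
static void simple_hist_add(struct simple_hist *h, long latency_us)
{
	long v = latency_us + h->offset;	/* shift sample into the table */

	if (v < 0)
		h->below++;			/* out of range: only counted */
	else if (v >= MAX_ENTRY_NUM)
		h->above++;
	else
		h->bucket[v]++;

	if (v > h->max_lat || h->min_lat == LONG_MAX)
		h->max_lat = v;
	if (v < h->min_lat)
		h->min_lat = v;

	h->total++;
	h->accumulated += v;		/* offset is subtracted again when printed */
}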
- - atomic_inc(&hist->hist_mode); -} - -static ssize_t -latency_hist_reset(struct file *file, const char __user *a, - size_t size, loff_t *off) -{ - int cpu; - struct hist_data *hist = NULL; -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - struct maxlatproc_data *mp = NULL; -#endif - off_t latency_type = (off_t) file->private_data; - - for_each_online_cpu(cpu) { - - switch (latency_type) { -#ifdef CONFIG_PREEMPT_OFF_HIST - case PREEMPTOFF_LATENCY: - hist = &per_cpu(preemptoff_hist, cpu); - break; -#endif -#ifdef CONFIG_INTERRUPT_OFF_HIST - case IRQSOFF_LATENCY: - hist = &per_cpu(irqsoff_hist, cpu); - break; -#endif -#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) - case PREEMPTIRQSOFF_LATENCY: - hist = &per_cpu(preemptirqsoff_hist, cpu); - break; -#endif -#ifdef CONFIG_WAKEUP_LATENCY_HIST - case WAKEUP_LATENCY: - hist = &per_cpu(wakeup_latency_hist, cpu); - mp = &per_cpu(wakeup_maxlatproc, cpu); - break; - case WAKEUP_LATENCY_SHAREDPRIO: - hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); - mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); - break; -#endif -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - case MISSED_TIMER_OFFSETS: - hist = &per_cpu(missed_timer_offsets, cpu); - mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); - break; -#endif -#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - case TIMERANDWAKEUP_LATENCY: - hist = &per_cpu(timerandwakeup_latency_hist, cpu); - mp = &per_cpu(timerandwakeup_maxlatproc, cpu); - break; -#endif - } - - hist_reset(hist); -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - if (latency_type == WAKEUP_LATENCY || - latency_type == WAKEUP_LATENCY_SHAREDPRIO || - latency_type == MISSED_TIMER_OFFSETS || - latency_type == TIMERANDWAKEUP_LATENCY) - clear_maxlatprocdata(mp); -#endif - } - - return size; -} - -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) -static ssize_t -show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - unsigned long *this_pid = file->private_data; - - r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t do_pid(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - unsigned long pid; - unsigned long *this_pid = file->private_data; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = '\0'; - - if (kstrtoul(buf, 10, &pid)) - return -EINVAL; - - *this_pid = pid; - - return cnt; -} -#endif - -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) -static ssize_t -show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) -{ - int r; - struct maxlatproc_data *mp = file->private_data; - int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8); - unsigned long long t; - unsigned long usecs, secs; - char *buf; - - if (mp->pid == -1 || mp->current_pid == -1) { - buf = "(none)\n"; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, - strlen(buf)); - } - - buf = kmalloc(strmaxlen, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; - - t = ns2usecs(mp->timestamp); - usecs = do_div(t, USEC_PER_SEC); - secs = (unsigned long) t; - r = snprintf(buf, strmaxlen, - "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid, - MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm, - 
mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm, - secs, usecs); - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - kfree(buf); - return r; -} -#endif - -static ssize_t -show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) -{ - char buf[64]; - struct enable_data *ed = file->private_data; - int r; - - r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) -{ - char buf[64]; - long enable; - struct enable_data *ed = file->private_data; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - if (kstrtoul(buf, 10, &enable)) - return -EINVAL; - - if ((enable && ed->enabled) || (!enable && !ed->enabled)) - return cnt; - - if (enable) { - int ret; - - switch (ed->latency_type) { -#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) - case PREEMPTIRQSOFF_LATENCY: - ret = register_trace_preemptirqsoff_hist( - probe_preemptirqsoff_hist, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't assign " - "probe_preemptirqsoff_hist " - "to trace_preemptirqsoff_hist\n"); - return ret; - } - break; -#endif -#ifdef CONFIG_WAKEUP_LATENCY_HIST - case WAKEUP_LATENCY: - ret = register_trace_sched_wakeup( - probe_wakeup_latency_hist_start, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't assign " - "probe_wakeup_latency_hist_start " - "to trace_sched_wakeup\n"); - return ret; - } - ret = register_trace_sched_wakeup_new( - probe_wakeup_latency_hist_start, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't assign " - "probe_wakeup_latency_hist_start " - "to trace_sched_wakeup_new\n"); - unregister_trace_sched_wakeup( - probe_wakeup_latency_hist_start, NULL); - return ret; - } - ret = register_trace_sched_switch( - probe_wakeup_latency_hist_stop, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't assign " - "probe_wakeup_latency_hist_stop " - "to trace_sched_switch\n"); - unregister_trace_sched_wakeup( - probe_wakeup_latency_hist_start, NULL); - unregister_trace_sched_wakeup_new( - probe_wakeup_latency_hist_start, NULL); - return ret; - } - ret = register_trace_sched_migrate_task( - probe_sched_migrate_task, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't assign " - "probe_sched_migrate_task " - "to trace_sched_migrate_task\n"); - unregister_trace_sched_wakeup( - probe_wakeup_latency_hist_start, NULL); - unregister_trace_sched_wakeup_new( - probe_wakeup_latency_hist_start, NULL); - unregister_trace_sched_switch( - probe_wakeup_latency_hist_stop, NULL); - return ret; - } - break; -#endif -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - case MISSED_TIMER_OFFSETS: - ret = register_trace_hrtimer_interrupt( - probe_hrtimer_interrupt, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't assign " - "probe_hrtimer_interrupt " - "to trace_hrtimer_interrupt\n"); - return ret; - } - break; -#endif -#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - case TIMERANDWAKEUP_LATENCY: - if (!wakeup_latency_enabled_data.enabled || - !missed_timer_offsets_enabled_data.enabled) - return -EINVAL; - break; -#endif - default: - break; - } - } else { - switch (ed->latency_type) { -#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) - case PREEMPTIRQSOFF_LATENCY: - { - int cpu; - - unregister_trace_preemptirqsoff_hist( - probe_preemptirqsoff_hist, NULL); - 
for_each_online_cpu(cpu) { -#ifdef CONFIG_INTERRUPT_OFF_HIST - per_cpu(hist_irqsoff_counting, - cpu) = 0; -#endif -#ifdef CONFIG_PREEMPT_OFF_HIST - per_cpu(hist_preemptoff_counting, - cpu) = 0; -#endif -#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) - per_cpu(hist_preemptirqsoff_counting, - cpu) = 0; -#endif - } - } - break; -#endif -#ifdef CONFIG_WAKEUP_LATENCY_HIST - case WAKEUP_LATENCY: - { - int cpu; - - unregister_trace_sched_wakeup( - probe_wakeup_latency_hist_start, NULL); - unregister_trace_sched_wakeup_new( - probe_wakeup_latency_hist_start, NULL); - unregister_trace_sched_switch( - probe_wakeup_latency_hist_stop, NULL); - unregister_trace_sched_migrate_task( - probe_sched_migrate_task, NULL); - - for_each_online_cpu(cpu) { - per_cpu(wakeup_task, cpu) = NULL; - per_cpu(wakeup_sharedprio, cpu) = 0; - } - } -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - timerandwakeup_enabled_data.enabled = 0; -#endif - break; -#endif -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - case MISSED_TIMER_OFFSETS: - unregister_trace_hrtimer_interrupt( - probe_hrtimer_interrupt, NULL); -#ifdef CONFIG_WAKEUP_LATENCY_HIST - timerandwakeup_enabled_data.enabled = 0; -#endif - break; -#endif - default: - break; - } - } - ed->enabled = enable; - return cnt; -} - -static const struct file_operations latency_hist_reset_fops = { - .open = tracing_open_generic, - .write = latency_hist_reset, -}; - -static const struct file_operations enable_fops = { - .open = tracing_open_generic, - .read = show_enable, - .write = do_enable, -}; - -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) -static const struct file_operations pid_fops = { - .open = tracing_open_generic, - .read = show_pid, - .write = do_pid, -}; - -static const struct file_operations maxlatproc_fops = { - .open = tracing_open_generic, - .read = show_maxlatproc, -}; -#endif - -#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) -static notrace void probe_preemptirqsoff_hist(void *v, int reason, - int starthist) -{ - int cpu = raw_smp_processor_id(); - int time_set = 0; - - if (starthist) { - cycle_t uninitialized_var(start); - - if (!preempt_count() && !irqs_disabled()) - return; - -#ifdef CONFIG_INTERRUPT_OFF_HIST - if ((reason == IRQS_OFF || reason == TRACE_START) && - !per_cpu(hist_irqsoff_counting, cpu)) { - per_cpu(hist_irqsoff_counting, cpu) = 1; - start = ftrace_now(cpu); - time_set++; - per_cpu(hist_irqsoff_start, cpu) = start; - } -#endif - -#ifdef CONFIG_PREEMPT_OFF_HIST - if ((reason == PREEMPT_OFF || reason == TRACE_START) && - !per_cpu(hist_preemptoff_counting, cpu)) { - per_cpu(hist_preemptoff_counting, cpu) = 1; - if (!(time_set++)) - start = ftrace_now(cpu); - per_cpu(hist_preemptoff_start, cpu) = start; - } -#endif - -#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) - if (per_cpu(hist_irqsoff_counting, cpu) && - per_cpu(hist_preemptoff_counting, cpu) && - !per_cpu(hist_preemptirqsoff_counting, cpu)) { - per_cpu(hist_preemptirqsoff_counting, cpu) = 1; - if (!time_set) - start = ftrace_now(cpu); - per_cpu(hist_preemptirqsoff_start, cpu) = start; - } -#endif - } else { - cycle_t uninitialized_var(stop); - -#ifdef CONFIG_INTERRUPT_OFF_HIST - if ((reason == IRQS_ON || reason == TRACE_STOP) && - per_cpu(hist_irqsoff_counting, cpu)) { - cycle_t start = per_cpu(hist_irqsoff_start, cpu); - - stop = ftrace_now(cpu); - time_set++; - if (start) { - long latency = ((long) (stop - start)) / - NSECS_PER_USECS; - - latency_hist(IRQSOFF_LATENCY, cpu, 
latency, 0, - stop, NULL); - } - per_cpu(hist_irqsoff_counting, cpu) = 0; - } -#endif - -#ifdef CONFIG_PREEMPT_OFF_HIST - if ((reason == PREEMPT_ON || reason == TRACE_STOP) && - per_cpu(hist_preemptoff_counting, cpu)) { - cycle_t start = per_cpu(hist_preemptoff_start, cpu); - - if (!(time_set++)) - stop = ftrace_now(cpu); - if (start) { - long latency = ((long) (stop - start)) / - NSECS_PER_USECS; - - latency_hist(PREEMPTOFF_LATENCY, cpu, latency, - 0, stop, NULL); - } - per_cpu(hist_preemptoff_counting, cpu) = 0; - } -#endif - -#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) - if ((!per_cpu(hist_irqsoff_counting, cpu) || - !per_cpu(hist_preemptoff_counting, cpu)) && - per_cpu(hist_preemptirqsoff_counting, cpu)) { - cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu); - - if (!time_set) - stop = ftrace_now(cpu); - if (start) { - long latency = ((long) (stop - start)) / - NSECS_PER_USECS; - - latency_hist(PREEMPTIRQSOFF_LATENCY, cpu, - latency, 0, stop, NULL); - } - per_cpu(hist_preemptirqsoff_counting, cpu) = 0; - } -#endif - } -} -#endif - -#ifdef CONFIG_WAKEUP_LATENCY_HIST -static DEFINE_RAW_SPINLOCK(wakeup_lock); -static notrace void probe_sched_migrate_task(void *v, struct task_struct *task, - int cpu) -{ - int old_cpu = task_cpu(task); - - if (cpu != old_cpu) { - unsigned long flags; - struct task_struct *cpu_wakeup_task; - - raw_spin_lock_irqsave(&wakeup_lock, flags); - - cpu_wakeup_task = per_cpu(wakeup_task, old_cpu); - if (task == cpu_wakeup_task) { - put_task_struct(cpu_wakeup_task); - per_cpu(wakeup_task, old_cpu) = NULL; - cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task; - get_task_struct(cpu_wakeup_task); - } - - raw_spin_unlock_irqrestore(&wakeup_lock, flags); - } -} - -static notrace void probe_wakeup_latency_hist_start(void *v, - struct task_struct *p, int success) -{ - unsigned long flags; - struct task_struct *curr = current; - int cpu = task_cpu(p); - struct task_struct *cpu_wakeup_task; - - raw_spin_lock_irqsave(&wakeup_lock, flags); - - cpu_wakeup_task = per_cpu(wakeup_task, cpu); - - if (wakeup_pid) { - if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) || - p->prio == curr->prio) - per_cpu(wakeup_sharedprio, cpu) = 1; - if (likely(wakeup_pid != task_pid_nr(p))) - goto out; - } else { - if (likely(!rt_task(p)) || - (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) || - p->prio > curr->prio) - goto out; - if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) || - p->prio == curr->prio) - per_cpu(wakeup_sharedprio, cpu) = 1; - } - - if (cpu_wakeup_task) - put_task_struct(cpu_wakeup_task); - cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p; - get_task_struct(cpu_wakeup_task); - cpu_wakeup_task->preempt_timestamp_hist = - ftrace_now(raw_smp_processor_id()); -out: - raw_spin_unlock_irqrestore(&wakeup_lock, flags); -} - -static notrace void probe_wakeup_latency_hist_stop(void *v, - struct task_struct *prev, struct task_struct *next) -{ - unsigned long flags; - int cpu = task_cpu(next); - long latency; - cycle_t stop; - struct task_struct *cpu_wakeup_task; - - raw_spin_lock_irqsave(&wakeup_lock, flags); - - cpu_wakeup_task = per_cpu(wakeup_task, cpu); - - if (cpu_wakeup_task == NULL) - goto out; - - /* Already running? 
*/ - if (unlikely(current == cpu_wakeup_task)) - goto out_reset; - - if (next != cpu_wakeup_task) { - if (next->prio < cpu_wakeup_task->prio) - goto out_reset; - - if (next->prio == cpu_wakeup_task->prio) - per_cpu(wakeup_sharedprio, cpu) = 1; - - goto out; - } - - if (current->prio == cpu_wakeup_task->prio) - per_cpu(wakeup_sharedprio, cpu) = 1; - - /* - * The task we are waiting for is about to be switched to. - * Calculate latency and store it in histogram. - */ - stop = ftrace_now(raw_smp_processor_id()); - - latency = ((long) (stop - next->preempt_timestamp_hist)) / - NSECS_PER_USECS; - - if (per_cpu(wakeup_sharedprio, cpu)) { - latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop, - next); - per_cpu(wakeup_sharedprio, cpu) = 0; - } else { - latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next); -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - if (timerandwakeup_enabled_data.enabled) { - latency_hist(TIMERANDWAKEUP_LATENCY, cpu, - next->timer_offset + latency, next->timer_offset, - stop, next); - } -#endif - } - -out_reset: -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - next->timer_offset = 0; -#endif - put_task_struct(cpu_wakeup_task); - per_cpu(wakeup_task, cpu) = NULL; -out: - raw_spin_unlock_irqrestore(&wakeup_lock, flags); -} -#endif - -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST -static notrace void probe_hrtimer_interrupt(void *v, int cpu, - long long latency_ns, struct task_struct *curr, - struct task_struct *task) -{ - if (latency_ns <= 0 && task != NULL && rt_task(task) && - (task->prio < curr->prio || - (task->prio == curr->prio && - !cpumask_test_cpu(cpu, &task->cpus_allowed)))) { - long latency; - cycle_t now; - - if (missed_timer_offsets_pid) { - if (likely(missed_timer_offsets_pid != - task_pid_nr(task))) - return; - } - - now = ftrace_now(cpu); - latency = (long) div_s64(-latency_ns, NSECS_PER_USECS); - latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now, - task); -#ifdef CONFIG_WAKEUP_LATENCY_HIST - task->timer_offset = latency; -#endif - } -} -#endif - -static __init int latency_hist_init(void) -{ - struct dentry *latency_hist_root = NULL; - struct dentry *dentry; -#ifdef CONFIG_WAKEUP_LATENCY_HIST - struct dentry *dentry_sharedprio; -#endif - struct dentry *entry; - struct dentry *enable_root; - int i = 0; - struct hist_data *my_hist; - char name[64]; - char *cpufmt = "CPU%d"; -#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - char *cpufmt_maxlatproc = "max_latency-CPU%d"; - struct maxlatproc_data *mp = NULL; -#endif - - dentry = tracing_init_dentry(); - latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry); - enable_root = debugfs_create_dir("enable", latency_hist_root); - -#ifdef CONFIG_INTERRUPT_OFF_HIST - dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root); - for_each_possible_cpu(i) { - sprintf(name, cpufmt, i); - entry = debugfs_create_file(name, 0444, dentry, - &per_cpu(irqsoff_hist, i), &latency_hist_fops); - my_hist = &per_cpu(irqsoff_hist, i); - atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = LONG_MAX; - } - entry = debugfs_create_file("reset", 0644, dentry, - (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops); -#endif - -#ifdef CONFIG_PREEMPT_OFF_HIST - dentry = debugfs_create_dir(preemptoff_hist_dir, - latency_hist_root); - for_each_possible_cpu(i) { - sprintf(name, cpufmt, i); - entry = debugfs_create_file(name, 0444, dentry, - &per_cpu(preemptoff_hist, i), &latency_hist_fops); - my_hist = &per_cpu(preemptoff_hist, i); - atomic_set(&my_hist->hist_mode, 1); - 
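The debugfs entries created in latency_hist_init() here are the same ones the removed Kconfig help texts point at: one CPU%d histogram file per directory, a reset file, optional pid and max_latency-CPU%d files, and a shared enable directory. As a hedged illustration of how that interface was driven, enabling the combined preempt/irqs-off histogram and dumping CPU0 could look like the program below; the program itself is not part of the patch and only assumes debugfs mounted at /sys/kernel/debug on a kernel that still carries this code.

#include <stdio.h>

int main(void)
{
	const char *enable =
		"/sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff";
	const char *hist =
		"/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPU0";
	char line[256];
	FILE *f;

	f = fopen(enable, "w");
	if (!f)
		return 1;
	fputs("1\n", f);	/* any non-zero value enables the histogram */
	fclose(f);

	f = fopen(hist, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* summary header plus usecs/samples rows */
	fclose(f);
	return 0;
}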
my_hist->min_lat = LONG_MAX; - } - entry = debugfs_create_file("reset", 0644, dentry, - (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops); -#endif - -#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) - dentry = debugfs_create_dir(preemptirqsoff_hist_dir, - latency_hist_root); - for_each_possible_cpu(i) { - sprintf(name, cpufmt, i); - entry = debugfs_create_file(name, 0444, dentry, - &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops); - my_hist = &per_cpu(preemptirqsoff_hist, i); - atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = LONG_MAX; - } - entry = debugfs_create_file("reset", 0644, dentry, - (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops); -#endif - -#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) - entry = debugfs_create_file("preemptirqsoff", 0644, - enable_root, (void *)&preemptirqsoff_enabled_data, - &enable_fops); -#endif - -#ifdef CONFIG_WAKEUP_LATENCY_HIST - dentry = debugfs_create_dir(wakeup_latency_hist_dir, - latency_hist_root); - dentry_sharedprio = debugfs_create_dir( - wakeup_latency_hist_dir_sharedprio, dentry); - for_each_possible_cpu(i) { - sprintf(name, cpufmt, i); - - entry = debugfs_create_file(name, 0444, dentry, - &per_cpu(wakeup_latency_hist, i), - &latency_hist_fops); - my_hist = &per_cpu(wakeup_latency_hist, i); - atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = LONG_MAX; - - entry = debugfs_create_file(name, 0444, dentry_sharedprio, - &per_cpu(wakeup_latency_hist_sharedprio, i), - &latency_hist_fops); - my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i); - atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = LONG_MAX; - - sprintf(name, cpufmt_maxlatproc, i); - - mp = &per_cpu(wakeup_maxlatproc, i); - entry = debugfs_create_file(name, 0444, dentry, mp, - &maxlatproc_fops); - clear_maxlatprocdata(mp); - - mp = &per_cpu(wakeup_maxlatproc_sharedprio, i); - entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp, - &maxlatproc_fops); - clear_maxlatprocdata(mp); - } - entry = debugfs_create_file("pid", 0644, dentry, - (void *)&wakeup_pid, &pid_fops); - entry = debugfs_create_file("reset", 0644, dentry, - (void *)WAKEUP_LATENCY, &latency_hist_reset_fops); - entry = debugfs_create_file("reset", 0644, dentry_sharedprio, - (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops); - entry = debugfs_create_file("wakeup", 0644, - enable_root, (void *)&wakeup_latency_enabled_data, - &enable_fops); -#endif - -#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - dentry = debugfs_create_dir(missed_timer_offsets_dir, - latency_hist_root); - for_each_possible_cpu(i) { - sprintf(name, cpufmt, i); - entry = debugfs_create_file(name, 0444, dentry, - &per_cpu(missed_timer_offsets, i), &latency_hist_fops); - my_hist = &per_cpu(missed_timer_offsets, i); - atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = LONG_MAX; - - sprintf(name, cpufmt_maxlatproc, i); - mp = &per_cpu(missed_timer_offsets_maxlatproc, i); - entry = debugfs_create_file(name, 0444, dentry, mp, - &maxlatproc_fops); - clear_maxlatprocdata(mp); - } - entry = debugfs_create_file("pid", 0644, dentry, - (void *)&missed_timer_offsets_pid, &pid_fops); - entry = debugfs_create_file("reset", 0644, dentry, - (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops); - entry = debugfs_create_file("missed_timer_offsets", 0644, - enable_root, (void *)&missed_timer_offsets_enabled_data, - &enable_fops); -#endif - -#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ - defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) - dentry = 
debugfs_create_dir(timerandwakeup_latency_hist_dir, - latency_hist_root); - for_each_possible_cpu(i) { - sprintf(name, cpufmt, i); - entry = debugfs_create_file(name, 0444, dentry, - &per_cpu(timerandwakeup_latency_hist, i), - &latency_hist_fops); - my_hist = &per_cpu(timerandwakeup_latency_hist, i); - atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = LONG_MAX; - - sprintf(name, cpufmt_maxlatproc, i); - mp = &per_cpu(timerandwakeup_maxlatproc, i); - entry = debugfs_create_file(name, 0444, dentry, mp, - &maxlatproc_fops); - clear_maxlatprocdata(mp); - } - entry = debugfs_create_file("reset", 0644, dentry, - (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops); - entry = debugfs_create_file("timerandwakeup", 0644, - enable_root, (void *)&timerandwakeup_enabled_data, - &enable_fops); -#endif - return 0; -} - -device_initcall(latency_hist_init); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f9401ed..138077b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -442,7 +442,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, irq_flags, preempt_count()); if (!event) return 0; @@ -1509,7 +1509,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, struct task_struct *tsk = current; entry->preempt_count = pc & 0xff; - entry->preempt_lazy_count = preempt_lazy_count(); entry->pid = (tsk) ? tsk->pid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT @@ -1519,10 +1518,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) | - (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0); - - entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0; + (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -2412,17 +2408,14 @@ get_total_entries(struct trace_buffer *buf, static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _--------=> CPU# \n"); - seq_puts(m, "# / _-------=> irqs-off \n"); - seq_puts(m, "# | / _------=> need-resched \n"); - seq_puts(m, "# || / _-----=> need-resched_lazy \n"); - seq_puts(m, "# ||| / _----=> hardirq/softirq \n"); - seq_puts(m, "# |||| / _---=> preempt-depth \n"); - seq_puts(m, "# ||||| / _--=> preempt-lazy-depth\n"); - seq_puts(m, "# |||||| / _-=> migrate-disable \n"); - seq_puts(m, "# ||||||| / delay \n"); - seq_puts(m, "# cmd pid |||||||| time | caller \n"); - seq_puts(m, "# \\ / |||||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct trace_buffer *buf, struct seq_file *m) @@ -2446,16 +2439,13 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); - seq_puts(m, "# _-------=> irqs-off \n"); - seq_puts(m, "# / _------=> need-resched \n"); - seq_puts(m, "# |/ _-----=> need-resched_lazy \n"); - seq_puts(m, "# ||/ _----=> hardirq/softirq \n"); - seq_puts(m, "# |||/ _---=> preempt-depth \n"); - seq_puts(m, "# ||||/ _--=> preempt-lazy-depth\n"); - seq_puts(m, "# ||||| / _-=> migrate-disable \n"); - seq_puts(m, "# |||||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||||| | |\n"); + seq_puts(m, "# _-----=> irqs-off\n"); + seq_puts(m, "# / _----=> need-resched\n"); + seq_puts(m, "# | / _---=> hardirq/softirq\n"); + seq_puts(m, "# || / _--=> preempt-depth\n"); + seq_puts(m, "# ||| / delay\n"); + seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | |||| | |\n"); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 109291a..10c86fb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -117,7 +117,6 @@ struct kretprobe_trace_entry_head { * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler - * NEED_RESCHED_LAZY - lazy reschedule is requested */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, @@ -125,7 +124,6 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, - TRACE_FLAG_NEED_RESCHED_LAZY = 0x20, }; #define TRACE_BUF_SIZE 1024 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7531ded..bc1bd20 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); @@ -166,8 +160,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(unsigned short, migrate_disable); - __common_field(unsigned short, padding); return ret; } diff --git 
a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d21a746..d7d0b50 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2f4eb37..2aefbee 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -17,7 +17,6 @@ #include <linux/fs.h> #include "trace.h" -#include <trace/events/hist.h> static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -440,13 +439,11 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); - trace_preemptirqsoff_hist(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { - trace_preemptirqsoff_hist(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -456,7 +453,6 @@ EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { - trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -465,7 +461,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); - trace_preemptirqsoff_hist(IRQS_OFF, 1); } #else /* !CONFIG_PROVE_LOCKING */ @@ -491,7 +486,6 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void trace_hardirqs_on(void) { - trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -501,13 +495,11 @@ void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); - trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off); void trace_hardirqs_on_caller(unsigned long caller_addr) { - trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } @@ -517,7 +509,6 @@ void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); - trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -527,14 +518,12 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace_preemptirqsoff_hist(PREEMPT_ON, 0); if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace_preemptirqsoff_hist(PREEMPT_ON, 1); if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 46b6467..34e7cba 100644 --- a/kernel/trace/trace_output.c +++ 
b/kernel/trace/trace_output.c @@ -606,7 +606,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; - char need_resched_lazy; char irqs_off; int hardirq; int softirq; @@ -621,17 +620,14 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) '.'; need_resched = (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; - need_resched_lazy = - (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; hardsoft_irq = (hardirq && softirq) ? 'H' : hardirq ? 'h' : softirq ? 's' : '.'; - if (!trace_seq_printf(s, "%c%c%c%c", - irqs_off, need_resched, need_resched_lazy, - hardsoft_irq)) + if (!trace_seq_printf(s, "%c%c%c", + irqs_off, need_resched, hardsoft_irq)) return 0; if (entry->preempt_count) @@ -639,16 +635,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); - if (entry->preempt_lazy_count) - ret = trace_seq_printf(s, "%x", entry->preempt_lazy_count); - else - ret = trace_seq_putc(s, '.'); - - if (entry->migrate_disable) - ret = trace_seq_printf(s, "%x", entry->migrate_disable); - else - ret = trace_seq_putc(s, '.'); - return ret; } diff --git a/kernel/user.c b/kernel/user.c index 2800008..5bbb919 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -154,11 +154,11 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save_nort(flags); + local_irq_save(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else - local_irq_restore_nort(flags); + local_irq_restore(flags); } struct user_struct *alloc_uid(kuid_t uid) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 13fb113..6991139 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -146,7 +146,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -170,7 +170,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -193,7 +193,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; @@ -609,9 +609,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write - * order and smp_read_barrier_depends() is guaranteed that we - * don't have crazy architectures returning stale data. - * + * order and smp_rmb() is guaranteed that we don't have crazy + * architectures returning stale data. 
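The uid_gid_map hunks above replace smp_read_barrier_depends() with smp_rmb() on the reader side, pairing with the writer's smp_wmb() described in this comment: the extents are published before nr_extents, and every reader must order its nr_extents load before the array loads. A schematic of that pairing follows; struct map, struct ext, publish() and lookup() are invented names (the real code lives in map_write() and the map_id_*() helpers), and smp_wmb()/smp_rmb() are assumed to come from the kernel's barrier headers.

struct ext { unsigned first, lower_first, count; };

struct map {
	unsigned nr;		/* number of valid entries in extent[] */
	struct ext extent[5];
};

/* Writer: publish the extents, then the count (cf. map_write()). */
static void publish(struct map *m, const struct ext *src, unsigned n)
{
	unsigned i;

	for (i = 0; i < n; i++)
		m->extent[i] = src[i];
	smp_wmb();		/* order extent[] stores before the count store */
	m->nr = n;
}

/* Reader: read the count, then the extents it covers (cf. map_id_down()). */
static unsigned lookup(const struct map *m, unsigned id)
{
	unsigned i, n = m->nr;

	smp_rmb();		/* pairs with smp_wmb() in publish() */
	for (i = 0; i < n; i++) {
		unsigned first = m->extent[i].first;

		if (id >= first && id - first < m->extent[i].count)
			return m->extent[i].lower_first + (id - first);
	}
	return (unsigned)-1;
}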
*/ mutex_lock(&id_map_mutex); diff --git a/kernel/wait-simple.c b/kernel/wait-simple.c deleted file mode 100644 index 7dfa86d..0000000 --- a/kernel/wait-simple.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Simple waitqueues without fancy flags and callbacks - * - * (C) 2011 Thomas Gleixner <tglx@linutronix.de> - * - * Based on kernel/wait.c - * - * For licencing details see kernel-base/COPYING - */ -#include <linux/init.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/wait-simple.h> - -/* Adds w to head->list. Must be called with head->lock locked. */ -static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) -{ - list_add(&w->node, &head->list); - /* We can't let the condition leak before the setting of head */ - smp_mb(); -} - -/* Removes w from head->list. Must be called with head->lock locked. */ -static inline void __swait_dequeue(struct swaiter *w) -{ - list_del_init(&w->node); -} - -void __init_swait_head(struct swait_head *head, struct lock_class_key *key) -{ - raw_spin_lock_init(&head->lock); - lockdep_set_class(&head->lock, key); - INIT_LIST_HEAD(&head->list); -} -EXPORT_SYMBOL(__init_swait_head); - -void swait_prepare_locked(struct swait_head *head, struct swaiter *w) -{ - w->task = current; - if (list_empty(&w->node)) - __swait_enqueue(head, w); -} - -void swait_prepare(struct swait_head *head, struct swaiter *w, int state) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&head->lock, flags); - swait_prepare_locked(head, w); - __set_current_state(state); - raw_spin_unlock_irqrestore(&head->lock, flags); -} -EXPORT_SYMBOL(swait_prepare); - -void swait_finish_locked(struct swait_head *head, struct swaiter *w) -{ - __set_current_state(TASK_RUNNING); - if (w->task) - __swait_dequeue(w); -} - -void swait_finish(struct swait_head *head, struct swaiter *w) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - if (w->task) { - raw_spin_lock_irqsave(&head->lock, flags); - __swait_dequeue(w); - raw_spin_unlock_irqrestore(&head->lock, flags); - } -} -EXPORT_SYMBOL(swait_finish); - -unsigned int -__swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num) -{ - struct swaiter *curr, *next; - int woken = 0; - - list_for_each_entry_safe(curr, next, &head->list, node) { - if (wake_up_state(curr->task, state)) { - __swait_dequeue(curr); - /* - * The waiting task can free the waiter as - * soon as curr->task = NULL is written, - * without taking any locks. A memory barrier - * is required here to prevent the following - * store to curr->task from getting ahead of - * the dequeue operation. 
- */ - smp_wmb(); - curr->task = NULL; - if (++woken == num) - break; - } - } - return woken; -} - -unsigned int -__swait_wake(struct swait_head *head, unsigned int state, unsigned int num) -{ - unsigned long flags; - int woken; - - if (!swaitqueue_active(head)) - return 0; - - raw_spin_lock_irqsave(&head->lock, flags); - woken = __swait_wake_locked(head, state, num); - raw_spin_unlock_irqrestore(&head->lock, flags); - return woken; -} -EXPORT_SYMBOL(__swait_wake); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 870b748..4431610 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -205,8 +205,6 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR -static DEFINE_RAW_SPINLOCK(watchdog_output_lock); - static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -241,19 +239,10 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - /* - * If early-printk is enabled then make sure we do not - * lock up in printk() and kill console logging: - */ - printk_kill(); - - if (hardlockup_panic) { + if (hardlockup_panic) panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - } else { - raw_spin_lock(&watchdog_output_lock); + else WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); - raw_spin_unlock(&watchdog_output_lock); - } __this_cpu_write(hard_watchdog_warn, true); return; @@ -357,7 +346,6 @@ static void watchdog_enable(unsigned int cpu) /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; - hrtimer->irqsafe = 1; /* Enable the perf event */ watchdog_nmi_enable(cpu); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9efb7ce..60fee69 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,8 +48,6 @@ #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> -#include <linux/locallock.h> -#include <linux/delay.h> #include "workqueue_internal.h" @@ -131,11 +129,11 @@ enum { * * PL: wq_pool_mutex protected. * - * PR: wq_pool_mutex protected for writes. RCU protected for reads. + * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. * * WQ: wq->mutex protected. * - * WR: wq->mutex protected for writes. RCU protected for reads. + * WR: wq->mutex protected for writes. Sched-RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -180,7 +178,7 @@ struct worker_pool { atomic_t nr_running ____cacheline_aligned_in_smp; /* - * Destruction of pool is RCU protected to allow dereferences + * Destruction of pool is sched-RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; @@ -209,7 +207,7 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also RCU protected so that the first pwq can be + * itself is also sched-RCU protected so that the first pwq can be * determined without grabbing wq->mutex. 
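These workqueue comments restore the pre-RT lifetime rule: pools and pwqs are sched-RCU protected, so a reader must keep preemption disabled (or hold the relevant mutex) rather than take rcu_read_lock(). A minimal conforming reader, written as if it lived inside workqueue.c, might look like the sketch below; inspect_pool_of is an invented name, while get_work_pool(), rcu_read_lock_sched()/rcu_read_unlock_sched(), pr_debug() and pool->id are existing kernel identifiers.

static void inspect_pool_of(struct work_struct *work)
{
	struct worker_pool *pool;

	rcu_read_lock_sched();		/* preemption off: pool cannot be freed */
	pool = get_work_pool(work);
	if (pool)
		pr_debug("work %p belongs to pool %d\n", work, pool->id);
	rcu_read_unlock_sched();
}

In practice most callers in this file get the same guarantee implicitly: start_flush_work() below now relies on local_irq_disable() and try_to_grab_pending() on local_irq_save() instead of an explicit sched-RCU read section.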
*/ struct work_struct unbound_release_work; @@ -325,8 +323,6 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); -static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock); - static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from); @@ -335,14 +331,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ - rcu_lockdep_assert(rcu_read_lock_held() || \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ lockdep_is_held(&wq_pool_mutex), \ - "RCU or wq_pool_mutex should be held") + "sched RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_held() || \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ lockdep_is_held(&wq->mutex), \ - "RCU or wq->mutex should be held") + "sched RCU or wq->mutex should be held") #ifdef CONFIG_LOCKDEP #define assert_manager_or_pool_lock(pool) \ @@ -364,7 +360,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_pool_mutex held or RCU read + * This must be called either with wq_pool_mutex held or sched RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * @@ -397,7 +393,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with wq->mutex held or RCU read locked. + * This must be called either with wq->mutex held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -545,7 +541,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called either with pwq_lock held or RCU read locked. + * This must be called either with pwq_lock held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -649,8 +645,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under RCU read lock. As such, this function should be - * called under wq_pool_mutex or inside of a rcu_read_lock() region. + * access under sched-RCU read lock. As such, this function should be + * called under wq_pool_mutex or with preemption disabled. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -808,31 +804,44 @@ static void wake_up_worker(struct worker_pool *pool) } /** - * wq_worker_running - a worker is running again - * @task: task returning from sleep + * wq_worker_waking_up - a worker is waking up + * @task: task waking up + * @cpu: CPU @task is waking up to + * + * This function is called during try_to_wake_up() when a worker is + * being awoken. 
* - * This function is called when a worker returns from schedule() + * CONTEXT: + * spin_lock_irq(rq->lock) */ -void wq_worker_running(struct task_struct *task) +void wq_worker_waking_up(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task); - if (!worker->sleeping) - return; - if (!(worker->flags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) { + WARN_ON_ONCE(worker->pool->cpu != cpu); atomic_inc(&worker->pool->nr_running); - worker->sleeping = 0; + } } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * This function is called from schedule() when a busy worker is - * going to sleep. + * @cpu: CPU in question, must be the current CPU number + * + * This function is called during schedule() when a busy worker is + * going to sleep. Worker on the same cpu can be woken up by + * returning pointer to its task. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * Return: + * Worker task on @cpu to wake up, %NULL if none. */ -void wq_worker_sleeping(struct task_struct *task) +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) { - struct worker *next, *worker = kthread_data(task); + struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; /* @@ -841,15 +850,14 @@ void wq_worker_sleeping(struct task_struct *task) * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) - return; + return NULL; pool = worker->pool; - if (WARN_ON_ONCE(worker->sleeping)) - return; + /* this can only happen on the local cpu */ + if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) + return NULL; - worker->sleeping = 1; - spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). @@ -862,12 +870,9 @@ void wq_worker_sleeping(struct task_struct *task) * lock is safe. */ if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) { - next = first_worker(pool); - if (next) - wake_up_process(next->task); - } - spin_unlock_irq(&pool->lock); + !list_empty(&pool->worklist)) + to_wakeup = first_worker(pool); + return to_wakeup ? to_wakeup->task : NULL; } /** @@ -1074,12 +1079,12 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* - * As both pwqs and pools are RCU protected, the + * As both pwqs and pools are sched-RCU protected, the * following lock operations are safe. */ - local_spin_lock_irq(pendingb_lock, &pwq->pool->lock); + spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); - local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock); + spin_unlock_irq(&pwq->pool->lock); } } @@ -1181,7 +1186,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, struct worker_pool *pool; struct pool_workqueue *pwq; - local_lock_irqsave(pendingb_lock, *flags); + local_irq_save(*flags); /* try to steal the timer if it exists */ if (is_dwork) { @@ -1200,7 +1205,6 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; - rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 
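The two hunks above restore the pre-RT contract between the scheduler and the workqueue pool manager: wq_worker_waking_up() is called from try_to_wake_up() with rq->lock held, and wq_worker_sleeping() hands the scheduler a worker task to wake instead of waking it itself. For orientation, the scheduler-side caller in kernels of this vintage looked roughly like the sketch below; it is recalled from __schedule() and is not part of this patch, with prev being the task about to block and cpu the local CPU.

	if (prev->flags & PF_WQ_WORKER) {
		struct task_struct *to_wakeup;

		/* Ask the workqueue code whether a peer worker on this CPU
		 * must take over, and wake it without dropping rq->lock. */
		to_wakeup = wq_worker_sleeping(prev, cpu);
		if (to_wakeup)
			try_to_wake_up_local(to_wakeup);
	}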
@@ -1239,16 +1243,14 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 		set_work_pool_and_keep_pending(work, pool->id);
 
 		spin_unlock(&pool->lock);
-		rcu_read_unlock();
 		return 1;
 	}
 	spin_unlock(&pool->lock);
 fail:
-	rcu_read_unlock();
-	local_unlock_irqrestore(pendingb_lock, *flags);
+	local_irq_restore(*flags);
 	if (work_is_canceling(work))
 		return -ENOENT;
-	cpu_chill();
+	cpu_relax();
 	return -EAGAIN;
 }
 
@@ -1317,7 +1319,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 	 * queued or lose PENDING. Grabbing PENDING and queueing should
 	 * happen with IRQ disabled.
 	 */
-	WARN_ON_ONCE_NONRT(!irqs_disabled());
+	WARN_ON_ONCE(!irqs_disabled());
 
 	debug_work_activate(work);
 
@@ -1325,8 +1327,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 	if (unlikely(wq->flags & __WQ_DRAINING) &&
 	    WARN_ON_ONCE(!is_chained_work(wq)))
 		return;
-
-	rcu_read_lock();
 retry:
 	if (req_cpu == WORK_CPU_UNBOUND)
 		cpu = raw_smp_processor_id();
@@ -1383,8 +1383,10 @@ retry:
 	/* pwq determined, queue */
 	trace_workqueue_queue_work(req_cpu, pwq, work);
 
-	if (WARN_ON(!list_empty(&work->entry)))
-		goto out;
+	if (WARN_ON(!list_empty(&work->entry))) {
+		spin_unlock(&pwq->pool->lock);
+		return;
+	}
 
 	pwq->nr_in_flight[pwq->work_color]++;
 	work_flags = work_color_to_flags(pwq->work_color);
@@ -1400,9 +1402,7 @@ retry:
 
 	insert_work(pwq, work, worklist, work_flags);
 
-out:
 	spin_unlock(&pwq->pool->lock);
-	rcu_read_unlock();
 }
 
 /**
@@ -1422,14 +1422,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
 	bool ret = false;
 	unsigned long flags;
 
-	local_lock_irqsave(pendingb_lock,flags);
+	local_irq_save(flags);
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		__queue_work(cpu, wq, work);
 		ret = true;
 	}
 
-	local_unlock_irqrestore(pendingb_lock, flags);
+	local_irq_restore(flags);
 	return ret;
 }
 EXPORT_SYMBOL(queue_work_on);
@@ -1496,14 +1496,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	unsigned long flags;
 
 	/* read the comment in __queue_work() */
-	local_lock_irqsave(pendingb_lock, flags);
+	local_irq_save(flags);
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		__queue_delayed_work(cpu, wq, dwork, delay);
 		ret = true;
 	}
 
-	local_unlock_irqrestore(pendingb_lock, flags);
+	local_irq_restore(flags);
 	return ret;
 }
 EXPORT_SYMBOL(queue_delayed_work_on);
@@ -1538,7 +1538,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
 
 	if (likely(ret >= 0)) {
 		__queue_delayed_work(cpu, wq, dwork, delay);
-		local_unlock_irqrestore(pendingb_lock, flags);
+		local_irq_restore(flags);
 	}
 
 	/* -ENOENT from try_to_grab_pending() becomes %true */
@@ -2809,14 +2809,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 
 	might_sleep();
 
-	rcu_read_lock();
+	local_irq_disable();
 	pool = get_work_pool(work);
 	if (!pool) {
-		rcu_read_unlock();
+		local_irq_enable();
 		return false;
 	}
 
-	spin_lock_irq(&pool->lock);
+	spin_lock(&pool->lock);
 	/* see the comment in try_to_grab_pending() with the same code */
 	pwq = get_work_pwq(work);
 	if (pwq) {
@@ -2843,11 +2843,10 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 	else
 		lock_map_acquire_read(&pwq->wq->lockdep_map);
 	lock_map_release(&pwq->wq->lockdep_map);
-	rcu_read_unlock();
+
 	return true;
 already_gone:
 	spin_unlock_irq(&pool->lock);
-	rcu_read_unlock();
 	return false;
 }
 
@@ -2901,7 +2900,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 
 	/* tell other tasks trying to grab @work to back off */
 	mark_work_canceling(work);
-	local_unlock_irqrestore(pendingb_lock, flags);
+	local_irq_restore(flags);
 
 	flush_work(work);
 	clear_work_data(work);
@@ -2946,10 +2945,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
 */
 bool flush_delayed_work(struct delayed_work *dwork)
 {
-	local_lock_irq(pendingb_lock);
+	local_irq_disable();
 	if (del_timer_sync(&dwork->timer))
 		__queue_work(dwork->cpu, dwork->wq, &dwork->work);
-	local_unlock_irq(pendingb_lock);
+	local_irq_enable();
 	return flush_work(&dwork->work);
 }
 EXPORT_SYMBOL(flush_delayed_work);
@@ -2984,7 +2983,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
 
 	set_work_pool_and_clear_pending(&dwork->work,
 					get_work_pool_id(&dwork->work));
-	local_unlock_irqrestore(pendingb_lock, flags);
+	local_irq_restore(flags);
 	return ret;
 }
 EXPORT_SYMBOL(cancel_delayed_work);
@@ -3170,8 +3169,7 @@ static ssize_t wq_pool_ids_show(struct device *dev,
 	const char *delim = "";
 	int node, written = 0;
 
-	get_online_cpus();
-	rcu_read_lock();
+	rcu_read_lock_sched();
 	for_each_node(node) {
 		written += scnprintf(buf + written, PAGE_SIZE - written,
 				     "%s%d:%d", delim, node,
@@ -3179,8 +3177,7 @@ static ssize_t wq_pool_ids_show(struct device *dev,
 		delim = " ";
 	}
 	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
-	rcu_read_unlock();
-	put_online_cpus();
+	rcu_read_unlock_sched();
 
 	return written;
 }
@@ -3546,7 +3543,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
 * put_unbound_pool - put a worker_pool
 * @pool: worker_pool to put
 *
- * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
+ * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
 * safe manner. get_unbound_pool() calls this function on its failure path
 * and this function should be able to release pools which went through,
 * successfully or not, init_worker_pool().
@@ -3593,8 +3590,8 @@ static void put_unbound_pool(struct worker_pool *pool)
 	del_timer_sync(&pool->idle_timer);
 	del_timer_sync(&pool->mayday_timer);
 
-	/* RCU protected to allow dereferences from get_work_pool() */
-	call_rcu(&pool->rcu, rcu_free_pool);
+	/* sched-RCU protected to allow dereferences from get_work_pool() */
+	call_rcu_sched(&pool->rcu, rcu_free_pool);
 }
 
 /**
@@ -3707,7 +3704,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
 	put_unbound_pool(pool);
 	mutex_unlock(&wq_pool_mutex);
 
-	call_rcu(&pwq->rcu, rcu_free_pwq);
+	call_rcu_sched(&pwq->rcu, rcu_free_pwq);
 
 	/*
 	 * If we're the last pwq going away, @wq is already dead and no one
@@ -4420,8 +4417,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
 	struct pool_workqueue *pwq;
 	bool ret;
 
-	rcu_read_lock();
-	preempt_disable();
+	rcu_read_lock_sched();
 
 	if (cpu == WORK_CPU_UNBOUND)
 		cpu = smp_processor_id();
@@ -4432,8 +4428,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
 		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
 
 	ret = !list_empty(&pwq->delayed_works);
-	preempt_enable();
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 
 	return ret;
 }
@@ -4459,15 +4454,16 @@ unsigned int work_busy(struct work_struct *work)
 	if (work_pending(work))
 		ret |= WORK_BUSY_PENDING;
 
-	rcu_read_lock();
+	local_irq_save(flags);
 	pool = get_work_pool(work);
 	if (pool) {
-		spin_lock_irqsave(&pool->lock, flags);
+		spin_lock(&pool->lock);
 		if (find_worker_executing_work(pool, work))
 			ret |= WORK_BUSY_RUNNING;
-		spin_unlock_irqrestore(&pool->lock, flags);
+		spin_unlock(&pool->lock);
 	}
-	rcu_read_unlock();
+	local_irq_restore(flags);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(work_busy);
@@ -4920,16 +4916,16 @@ bool freeze_workqueues_busy(void)
 		 * nr_active is monotonically decreasing. It's safe
 		 * to peek without lock.
 		 */
-		rcu_read_lock();
+		rcu_read_lock_sched();
 		for_each_pwq(pwq, wq) {
 			WARN_ON_ONCE(pwq->nr_active < 0);
 			if (pwq->nr_active) {
 				busy = true;
-				rcu_read_unlock();
+				rcu_read_unlock_sched();
 				goto out_unlock;
 			}
 		}
-		rcu_read_unlock();
+		rcu_read_unlock_sched();
 	}
 out_unlock:
 	mutex_unlock(&wq_pool_mutex);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 2bb5b5a..7e2204d 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -41,7 +41,6 @@ struct worker {
 	unsigned long		last_active;	/* L: last active timestamp */
 	unsigned int		flags;		/* X: flags */
 	int			id;		/* I: worker id */
-	int			sleeping;	/* None */
 
 	/*
 	 * Opaque string set with work_set_desc(). Printed out with task
@@ -67,7 +66,7 @@ static inline struct worker *current_wq_worker(void)
 * Scheduler hooks for concurrency managed workqueue. Only to be used from
 * sched/core.c and workqueue.c.
 */
-void wq_worker_running(struct task_struct *task);
-void wq_worker_sleeping(struct task_struct *task);
+void wq_worker_waking_up(struct task_struct *task, int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
 
 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
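
The wq_worker_sleeping() hunks above restore the vanilla nr_running handoff: when the last running worker of a pool blocks while the worklist is still non-empty, the scheduler hook hands back an idle worker's task for the scheduler to wake. The C sketch below is a simplified user-space model of that pattern, not kernel code; the names pool_model, worker_model and pick_worker_to_wake are invented for illustration.

/*
 * Minimal user-space sketch (illustrative only) of the nr_running
 * handoff: the last running worker going to sleep, with work still
 * queued, nominates an idle worker for the caller to wake.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct worker_model {
	const char *name;
	bool not_running;		/* models WORKER_NOT_RUNNING */
};

struct pool_model {
	atomic_int nr_running;		/* running (not blocked) workers */
	int nr_queued;			/* models !list_empty(&pool->worklist) */
	struct worker_model *first_idle;/* models first_worker(pool) */
};

/* Called when @w is about to block; returns a worker to wake, or NULL. */
static struct worker_model *pick_worker_to_wake(struct pool_model *pool,
						struct worker_model *w)
{
	if (w->not_running)
		return NULL;		/* worker not counted in nr_running */

	/*
	 * Mirrors atomic_dec_and_test(&pool->nr_running) &&
	 * !list_empty(&pool->worklist): hand over to an idle worker.
	 */
	if (atomic_fetch_sub(&pool->nr_running, 1) == 1 && pool->nr_queued > 0)
		return pool->first_idle;
	return NULL;
}

int main(void)
{
	struct worker_model idle = { .name = "idle-worker" };
	struct worker_model busy = { .name = "busy-worker" };
	struct pool_model pool = { .nr_queued = 1, .first_idle = &idle };

	atomic_store(&pool.nr_running, 1);	/* only @busy is running */

	struct worker_model *next = pick_worker_to_wake(&pool, &busy);
	printf("wake up: %s\n", next ? next->name : "(nobody)");
	return 0;
}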
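The try_to_grab_pending() hunks likewise restore the vanilla retry contract: -EAGAIN tells the caller (for example __cancel_work_timer()) to try again after a short cpu_relax() pause instead of the RT cpu_chill() sleep. A minimal user-space sketch of that loop, with grab_pending_model() and cpu_pause() as made-up stand-ins rather than kernel API, might look like this:

/*
 * User-space sketch (illustrative only) of retrying a PENDING grab
 * until it succeeds, pausing briefly on each -EAGAIN.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag pending = ATOMIC_FLAG_INIT;	/* models WORK_STRUCT_PENDING */

static void cpu_pause(void)			/* stands in for cpu_relax() */
{
#if defined(__x86_64__) || defined(__i386__)
	__asm__ __volatile__("pause");
#endif
}

/* 0: we now own PENDING; -EAGAIN: someone else holds it, retry. */
static int grab_pending_model(void)
{
	if (!atomic_flag_test_and_set(&pending))
		return 0;
	return -EAGAIN;
}

int main(void)
{
	int ret;

	do {
		ret = grab_pending_model();
		if (ret == -EAGAIN)
			cpu_pause();
	} while (ret == -EAGAIN);

	printf("grabbed PENDING, ret=%d\n", ret);
	return 0;
}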