diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/bpf/Makefile | 6 | ||||
-rw-r--r-- | kernel/bpf/core.c | 9 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 3 | ||||
-rw-r--r-- | kernel/context_tracking.c | 40 | ||||
-rw-r--r-- | kernel/cpu.c | 14 | ||||
-rw-r--r-- | kernel/events/core.c | 15 | ||||
-rw-r--r-- | kernel/events/hw_breakpoint.c | 7 | ||||
-rw-r--r-- | kernel/freezer.c | 9 | ||||
-rw-r--r-- | kernel/futex.c | 36 | ||||
-rw-r--r-- | kernel/gcov/Kconfig | 2 | ||||
-rw-r--r-- | kernel/kmod.c | 76 | ||||
-rw-r--r-- | kernel/power/hibernate.c | 8 | ||||
-rw-r--r-- | kernel/power/process.c | 57 | ||||
-rw-r--r-- | kernel/power/qos.c | 27 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 15 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 1 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 33 | ||||
-rw-r--r-- | kernel/sched/core.c | 47 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 41 | ||||
-rw-r--r-- | kernel/sched/fair.c | 21 | ||||
-rw-r--r-- | kernel/sysctl.c | 3 | ||||
-rw-r--r-- | kernel/time/clockevents.c | 2 | ||||
-rw-r--r-- | kernel/time/posix-timers.c | 1 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 54 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 8 |
26 files changed, 356 insertions, 181 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index dc5c775..17ea6d4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o -obj-$(CONFIG_NET) += bpf/ +obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 4542723..0daf7f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ -obj-y := core.o syscall.o verifier.o - +obj-y := core.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o ifdef CONFIG_TEST_BPF -obj-y += test_stub.o +obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f0c30c5..d6594e4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp) schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); + +/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call + * skb_copy_bits(), so provide a weak definition of it for NET-less config. + */ +int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, + int len) +{ + return -EFAULT; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 801f5f3..9f81818 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) if (memcmp(&old->regs[i], &cur->regs[i], sizeof(old->regs[0])) != 0) { if (old->regs[i].type == NOT_INIT || - old->regs[i].type == UNKNOWN_VALUE) + (old->regs[i].type == UNKNOWN_VALUE && + cur->regs[i].type != NOT_INIT)) continue; return false; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985..937ecdf 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) } NOKPROBE_SYMBOL(context_tracking_user_enter); -#ifdef CONFIG_PREEMPT -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - /* - * Need to disable preemption in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - preempt_disable_notrace(); - prev_ctx = exception_enter(); - preempt_enable_no_resched_notrace(); - - preempt_schedule(); - - preempt_disable_notrace(); - exception_exit(prev_ctx); - preempt_enable_notrace(); -} -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_PREEMPT */ - /** * context_tracking_user_exit - Inform the context tracking that the CPU is * exiting userspace mode and entering the kernel. diff --git a/kernel/cpu.c b/kernel/cpu.c index 356450f..90a3d01 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + /* And allows lockless put_online_cpus(). */ + atomic_t puts_pending; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -113,7 +115,11 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + if (!mutex_trylock(&cpu_hotplug.lock)) { + atomic_inc(&cpu_hotplug.puts_pending); + cpuhp_lock_release(); + return; + } if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ @@ -155,6 +161,12 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); + if (atomic_read(&cpu_hotplug.puts_pending)) { + int delta; + + delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); + cpu_hotplug.refcount -= delta; + } if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/kernel/events/core.c b/kernel/events/core.c index 1425d07..2b02c9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6071,11 +6071,6 @@ static int perf_swevent_init(struct perf_event *event) return 0; } -static int perf_swevent_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -6085,8 +6080,6 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -6204,8 +6197,6 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -6431,8 +6422,6 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; /* @@ -6511,8 +6500,6 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -6542,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) static int perf_event_idx_default(struct perf_event *event) { - return event->hw.idx + 1; + return 0; } /* diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0..9803a66 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } -static int hw_breakpoint_event_idx(struct perf_event *bp) -{ - return 0; -} - static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, - - .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aa..a8900a3 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; + if (test_thread_flag(TIF_MEMDIE)) + return false; + if (pm_nosig_freezing || cgroup_freezing(p)) return true; @@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p) { unsigned long flags; - /* - * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to - * be visible to @p as waking up implies wmb. Waking up inside - * freezer_lock also prevents wakeups from leaking outside - * refrigerator. - */ spin_lock_irqsave(&freezer_lock, flags); if (frozen(p)) wake_up_process(p); diff --git a/kernel/futex.c b/kernel/futex.c index f3a3a07..63678b5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -143,9 +143,8 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers in - * get_futex_key_refs(), through either ihold or atomic_inc, depending on the - * futex type. + * to futex and the waiters read -- this is done by the barriers for both + * shared and private futexes in get_futex_key_refs(). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -344,13 +343,20 @@ static void get_futex_key_refs(union futex_key *key) futex_get_mm(key); /* implies MB (B) */ break; default: + /* + * Private futexes do not hold reference on an inode or + * mm, therefore the only purpose of calling get_futex_key_refs + * is because we need the barrier for the lockless waiter check. + */ smp_mb(); /* explicit MB (B) */ } } /* * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. + * The hash bucket spinlock must not be held. This is + * a no-op for private futexes, see comment in the get + * counterpart. */ static void drop_futex_key_refs(union futex_key *key) { @@ -641,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +/* + * Must be called with the hb lock held. + */ static void free_pi_state(struct futex_pi_state *pi_state) { + if (!pi_state) + return; + if (!atomic_dec_and_test(&pi_state->refcount)) return; @@ -1521,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - if (pi_state != NULL) { - /* - * We will have to lookup the pi_state again, so free this one - * to keep the accounting correct. - */ - free_pi_state(pi_state); - pi_state = NULL; - } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1619,6 +1622,8 @@ retry_private: case 0: break; case -EFAULT: + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1634,6 +1639,8 @@ retry_private: * exit to complete. * - The user space value changed. */ + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1710,6 +1717,7 @@ retry_private: } out_unlock: + free_pi_state(pi_state); double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1727,8 +1735,6 @@ out_put_keys: out_put_key1: put_futex_key(&key1); out: - if (pi_state != NULL) - free_pi_state(pi_state); return ret ? ret : task_count; } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index cf66c5c..3b74087 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e04..80f7a6d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...) EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* * This is the task which runs the usermode application */ static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + int wait = sub_info->wait & ~UMH_KILLABLE; struct cred *new; int retval; @@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data) retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) - goto fail; + goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); @@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data) retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); - goto fail; + goto out; } } @@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data) retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + if (wait != UMH_WAIT_PROC) + umh_complete(sub_info); if (!retval) return 0; - - /* Exec failed? */ -fail: - sub_info->retval = retval; do_exit(0); } @@ -258,26 +281,6 @@ static int call_helper(void *data) return ____call_usermodehelper(data); } -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work) kmod_thread_locker = NULL; } - switch (wait) { - case UMH_NO_WAIT: - call_usermodehelper_freeinfo(sub_info); - break; - - case UMH_WAIT_PROC: - if (pid > 0) - break; - /* FALLTHROUGH */ - case UMH_WAIT_EXEC: - if (pid < 0) - sub_info->retval = pid; + if (pid < 0) { + sub_info->retval = pid; umh_complete(sub_info); } } @@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) goto out; } - sub_info->complete = &done; + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a9dfa79..1f35a34 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode) error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. Otherwise things are just + * undefined, so let's be paranoid. + */ + BUG_ON(!error); } + dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); resume_console(); pm_restore_console(); diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b32322..5a6ec86 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); if (!user_only) { @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { @@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +static bool __check_frozen_processes(void) +{ + struct task_struct *g, *p; + + for_each_process_thread(g, p) + if (p != current && !freezer_should_skip(p) && !frozen(p)) + return false; + + return true; +} + +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + bool ret; + + read_lock(&tasklist_lock); + ret = __check_frozen_processes(); + read_unlock(&tasklist_lock); + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,11 +157,25 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + } else { + printk("done."); + } } printk("\n"); BUG_ON(in_atomic()); @@ -191,11 +230,11 @@ void thaw_processes(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); @@ -218,10 +257,10 @@ void thaw_kernel_threads(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b770..5f4c006 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; + case PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 133e472..9815447 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3299,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp) continue; rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); + if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { + _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + rsp->n_barrier_done); + } else { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, + rcu_barrier_callback, rsp, cpu, 0); + } } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d037646..bbdc45d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -587,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 387dd45..c1d7f27 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2050,6 +2050,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) } /* + * Does the specified CPU need an RCU callback for the specified flavor + * of rcu_barrier()? + */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_head *rhp; + + /* No-CBs CPUs might have callbacks on any of three lists. */ + rhp = ACCESS_ONCE(rdp->nocb_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_gp_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_follower_head); + + /* Having no rcuo kthread but CBs after scheduler starts is bad! */ + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + /* RCU callback enqueued before CPU first came online??? */ + pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", + cpu, rhp->func); + WARN_ON_ONCE(1); + } + + return !!rhp; +} + +/* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy @@ -2642,6 +2669,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + WARN_ON_ONCE(1); /* Should be dead code. */ + return false; +} + static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4499950..240157c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + __preempt_count_add(PREEMPT_ACTIVE); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* @@ -7833,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -8205,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 256e577..5285332 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -518,12 +518,20 @@ again: } /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()). + * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); @@ -532,7 +540,7 @@ again: dl_se->dl_yielded = 0; if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) + if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); @@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. In fact, if it exhausted @@ -1607,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b069bf..34baa60 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); @@ -1164,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); /* * "imp" is the fault differential for the source task between the @@ -1520,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aada6d..15f2511 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "numa_balancing", diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19..5544990 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && - (!ismax || evt->mult <= (1U << evt->shift))) + (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463a..31ea01f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fb186b9..31c90fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1925,8 +1925,16 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) * when we are adding another op to the rec or removing the * current one. Thus, if the op is being added, we can * ignore it because it hasn't attached itself to the rec - * yet. That means we just need to find the op that has a - * trampoline and is not beeing added. + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. */ do_for_each_ftrace_op(op, ftrace_ops_list) { @@ -1940,17 +1948,23 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) if (op->flags & FTRACE_OPS_FL_ADDING) continue; + /* - * If the ops is not being added and has a trampoline, - * then it must be the one that we want! + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. */ - if (hash_contains_ip(ip, op->func_hash)) - return op; - - /* If the ops is being modified, it may be in the old hash. */ if ((op->flags & FTRACE_OPS_FL_MODIFYING) && hash_contains_ip(ip, &op->old_hash)) return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! + */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) + return op; } while_for_each_ftrace_op(op); @@ -2293,10 +2307,13 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); } -static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_hash *old_hash) { ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash; ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; ops->flags &= ~FTRACE_OPS_FL_MODIFYING; } @@ -3340,7 +3357,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(void) +static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) { int ret; int i; @@ -3348,7 +3365,8 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + old_hash); return; } @@ -3477,13 +3495,14 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + __enable_ftrace_function_probe(old_hash); + if (!ret) free_ftrace_hash_rcu(old_hash); else count = ret; - __enable_ftrace_function_probe(); - out_unlock: mutex_unlock(&ftrace_lock); out: @@ -3764,10 +3783,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops) +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_hash *old_hash) { if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); } static int @@ -3813,7 +3833,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, old_hash = *orig_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret) { - ftrace_ops_update_code(ops); + ftrace_ops_update_code(ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); @@ -4058,7 +4078,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); if (!ret) { - ftrace_ops_update_code(iter->ops); + ftrace_ops_update_code(iter->ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4dc8b79..29228c4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ @@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ @@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; |