From b74e6278fd6db5848163ccdc6e9d8eb6efdee9bd Mon Sep 17 00:00:00 2001
From: Alex Thorlton
Date: Thu, 18 Dec 2014 12:44:30 -0600
Subject: sched: Fix KMALLOC_MAX_SIZE overflow during cpumask allocation

When allocating space for load_balance_mask, in sched_init, when
CPUMASK_OFFSTACK is set, we've managed to spill over KMALLOC_MAX_SIZE
on our 6144 core machine. The patch below breaks up the allocations so
that they don't overflow the max alloc size. It also allocates the
masks on the node from which they'll most commonly be accessed, to
minimize remote accesses on NUMA machines.

Suggested-by: George Beshers
Signed-off-by: Alex Thorlton
Cc: George Beshers
Cc: Russ Anderson
Cc: Peter Zijlstra
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/1418928270-148543-1-git-send-email-athorlton@sgi.com
Signed-off-by: Ingo Molnar

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5797b7..c0accc0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7113,9 +7113,6 @@ void __init sched_init(void)
 #ifdef CONFIG_RT_GROUP_SCHED
         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-        alloc_size += num_possible_cpus() * cpumask_size();
-#endif
         if (alloc_size) {
                 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

@@ -7135,13 +7132,13 @@ void __init sched_init(void)
                 ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_RT_GROUP_SCHED */

+        }
 #ifdef CONFIG_CPUMASK_OFFSTACK
-                for_each_possible_cpu(i) {
-                        per_cpu(load_balance_mask, i) = (void *)ptr;
-                        ptr += cpumask_size();
-                }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
+        for_each_possible_cpu(i) {
+                per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+                        cpumask_size(), GFP_KERNEL, cpu_to_node(i));
         }
+#endif /* CONFIG_CPUMASK_OFFSTACK */

         init_rt_bandwidth(&def_rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
--
cgit v0.10.2
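For scale, a back-of-the-envelope sketch of why the single kzalloc() overflows while the per-CPU allocations stay small. The CPU count comes from the changelog above; the one-bit-per-CPU mask size and the 4 MB kmalloc ceiling are illustrative assumptions (the real KMALLOC_MAX_SIZE depends on MAX_ORDER, the page size and the slab allocator):

#include <stdio.h>

int main(void)
{
        /* Illustrative assumptions, not the values of any particular config:
         * 6144 possible CPUs (from the changelog), one bit per CPU in a
         * cpumask, and a 4 MB kmalloc ceiling. */
        unsigned long nr_cpus      = 6144;
        unsigned long mask_bytes   = nr_cpus / 8;               /* 768 bytes */
        unsigned long kmalloc_max  = 4UL * 1024 * 1024;         /* 4 MB      */

        unsigned long one_big_alloc = nr_cpus * mask_bytes;     /* old scheme */
        unsigned long per_cpu_alloc = mask_bytes;               /* new scheme */

        printf("single allocation: %lu bytes (ceiling: %lu)\n",
               one_big_alloc, kmalloc_max);                     /* 4718592 > 4194304 */
        printf("per-CPU allocation: %lu bytes each\n", per_cpu_alloc);
        return 0;
}

Splitting the allocation per CPU keeps every request far below the ceiling, and kzalloc_node(..., cpu_to_node(i)) additionally places each mask on the NUMA node that will touch it most often.
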
From 536ebe9ca999f6d0903d91698678ccc1742e8dd9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 16 Dec 2014 16:28:38 +0100
Subject: sched, fanotify: Deal with nested sleeps

As per e23738a7300a ("sched, inotify: Deal with nested sleeps").

fanotify_read is a wait loop with sleeps in. Wait loops rely on
task_struct::state and sleeps do too, since that's the only means of
actually sleeping. Therefore the nested sleeps destroy the wait loop
state and the wait loop breaks the sleep functions that assume
TASK_RUNNING (mutex_lock).

Fix this by using the new woken_wake_function and wait_woken() stuff,
which registers wakeups in wait and thereby allows shrinking the
task_state::state changes to the actual sleep part.

Reported-by: Yuanhan Liu
Reported-by: Sedat Dilek
Signed-off-by: Peter Zijlstra (Intel)
Cc: Takashi Iwai
Cc: Al Viro
Cc: Eric Paris
Cc: Linus Torvalds
Cc: Eric Paris
Link: http://lkml.kernel.org/r/20141216152838.GZ3337@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c991616..bff8567 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -259,16 +259,15 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
         struct fsnotify_event *kevent;
         char __user *start;
         int ret;
-        DEFINE_WAIT(wait);
+        DEFINE_WAIT_FUNC(wait, woken_wake_function);

         start = buf;
         group = file->private_data;

         pr_debug("%s: group=%p\n", __func__, group);

+        add_wait_queue(&group->notification_waitq, &wait);
         while (1) {
-                prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
-
                 mutex_lock(&group->notification_mutex);
                 kevent = get_one_event(group, count);
                 mutex_unlock(&group->notification_mutex);
@@ -289,7 +288,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,

                 if (start != buf)
                         break;
-                schedule();
+
+                wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
                 continue;
         }
@@ -318,8 +318,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
                 buf += ret;
                 count -= ret;
         }
+        remove_wait_queue(&group->notification_waitq, &wait);

-        finish_wait(&group->notification_waitq, &wait);
         if (start != buf && ret != -EFAULT)
                 ret = buf - start;
         return ret;
--
cgit v0.10.2
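For reference, the generic shape of the wait_woken() loop the patch converts fanotify_read() to looks roughly like this. It is a minimal sketch of the pattern only; wq, event_pending() and consume_event() are made-up placeholders, not fanotify code:

static wait_queue_head_t wq;            /* placeholder wait queue */

static long wait_for_event(void)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        long ret = 0;

        add_wait_queue(&wq, &wait);
        while (1) {
                /* The task state is still TASK_RUNNING here, so taking a
                 * mutex or another sleeping lock while checking the
                 * condition is safe -- the point of the conversion. */
                if (event_pending()) {          /* placeholder condition */
                        consume_event();        /* placeholder work      */
                        break;
                }
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                /* Only this call changes the task state; a wakeup that
                 * raced with the checks above was recorded in 'wait' by
                 * woken_wake_function() and makes it return immediately. */
                wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
        }
        remove_wait_queue(&wq, &wait);

        return ret;
}
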
From 32a8df4e0b33fccc9715213b382160415b5c4008 Mon Sep 17 00:00:00 2001
From: Yuyang Du
Date: Fri, 19 Dec 2014 08:29:56 +0800
Subject: sched: Fix odd values in effective_load() calculations

In effective_load, we have (long w * unsigned long tg->shares) / long W,
when w is negative, it is cast to unsigned long and hence the product is
insanely large. Fix this by casting tg->shares to long.

Reported-by: Sasha Levin
Signed-off-by: Yuyang Du
Signed-off-by: Peter Zijlstra (Intel)
Cc: Dave Jones
Cc: Andrey Ryabinin
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/20141219002956.GA25405@intel.com
Signed-off-by: Ingo Molnar

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df2cdf7..6b99659 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4424,7 +4424,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
                  * wl = S * s'_i; see (2)
                  */
                 if (W > 0 && w < W)
-                        wl = (w * tg->shares) / W;
+                        wl = (w * (long)tg->shares) / W;
                 else
                         wl = tg->shares;

--
cgit v0.10.2
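The conversion bug is plain C and easy to reproduce in isolation; the sketch below uses arbitrary stand-in values (not the scheduler's actual weights) to show how the sign is lost and how the cast restores it:

#include <stdio.h>

int main(void)
{
        long w = -512;                  /* stand-in for a negative weight delta */
        unsigned long shares = 1024;    /* tg->shares is unsigned long          */
        long W = 2048;

        /* 'w' is converted to unsigned long before the multiply, so the
         * product becomes a huge positive value instead of -524288. */
        long wl_bad  = (w * shares) / W;

        /* Casting shares to long keeps the whole expression signed. */
        long wl_good = (w * (long)shares) / W;

        printf("uncast: %ld, cast: %ld\n", wl_bad, wl_good);   /* cast: -256 */
        return 0;
}
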
From 6a503c3be937d275113b702e0421e5b0720abe8a Mon Sep 17 00:00:00 2001
From: Luca Abeni
Date: Wed, 17 Dec 2014 11:50:31 +0100
Subject: sched/deadline: Fix migration of SCHED_DEADLINE tasks

According to global EDF, tasks should be migrated between runqueues
without checking if their scheduling deadlines and runtimes are valid.
However, SCHED_DEADLINE currently performs such a check: a migration
happens doing:

        deactivate_task(rq, next_task, 0);
        set_task_cpu(next_task, later_rq->cpu);
        activate_task(later_rq, next_task, 0);

which ends up calling dequeue_task_dl(), setting the new CPU, and then
calling enqueue_task_dl().

enqueue_task_dl() then calls enqueue_dl_entity(), which calls
update_dl_entity(), which can modify scheduling deadline and runtime,
breaking global EDF scheduling.

As a result, some of the properties of global EDF are not respected:
for example, a taskset {(30, 80), (40, 80), (120, 170)} scheduled on
two cores can have unbounded response times for the third task even
if 30/80 + 40/80 + 120/170 = 1.5809 < 2

This can be fixed by invoking update_dl_entity() only in case of
wakeup, or if this is a new SCHED_DEADLINE task.

Signed-off-by: Luca Abeni
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Juri Lelli
Cc:
Cc: Dario Faggioli
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/1418813432-20797-2-git-send-email-luca.abeni@unitn.it
Signed-off-by: Ingo Molnar

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e5db8c6..55af498 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -826,10 +826,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
          * parameters of the task might need updating. Otherwise,
          * we want a replenishment of its runtime.
          */
-        if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
-                replenish_dl_entity(dl_se, pi_se);
-        else
+        if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
                 update_dl_entity(dl_se, pi_se);
+        else if (flags & ENQUEUE_REPLENISH)
+                replenish_dl_entity(dl_se, pi_se);

         __enqueue_dl_entity(dl_se);
 }
--
cgit v0.10.2

From 269ad8015a6b2bb1cf9e684da4921eb6fa0a0c88 Mon Sep 17 00:00:00 2001
From: Luca Abeni
Date: Wed, 17 Dec 2014 11:50:32 +0100
Subject: sched/deadline: Avoid double-accounting in case of missed deadlines

The dl_runtime_exceeded() function is supposed to check if a
SCHED_DEADLINE task must be throttled, by checking if its current
runtime is <= 0. However, it also checks if the scheduling deadline has
been missed (the current time is larger than the current scheduling
deadline), further decreasing the runtime if this happens.

This "double accounting" is wrong:

- In case of partitioned scheduling (or single CPU), this happens if
  task_tick_dl() has been called later than expected (due to small HZ
  values). In this case, the current runtime is also negative, and
  replenish_dl_entity() can take care of the deadline miss by recharging
  the current runtime to a value smaller than dl_runtime

- In case of global scheduling on multiple CPUs, scheduling deadlines
  can be missed even if the task did not consume more runtime than
  expected, hence penalizing the task is wrong

This patch fixes the problem by throttling a SCHED_DEADLINE task only
when its runtime becomes negative, and by not modifying the runtime.

Signed-off-by: Luca Abeni
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Juri Lelli
Cc:
Cc: Dario Faggioli
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/1418813432-20797-3-git-send-email-luca.abeni@unitn.it
Signed-off-by: Ingo Molnar

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 55af498..b52092f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -570,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
 static
 int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
 {
-        int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
-        int rorun = dl_se->runtime <= 0;
-
-        if (!rorun && !dmiss)
-                return 0;
-
-        /*
-         * If we are beyond our current deadline and we are still
-         * executing, then we have already used some of the runtime of
-         * the next instance. Thus, if we do not account that, we are
-         * stealing bandwidth from the system at each deadline miss!
-         */
-        if (dmiss) {
-                dl_se->runtime = rorun ? dl_se->runtime : 0;
-                dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
-        }
-
-        return 1;
+        return (dl_se->runtime <= 0);
 }

 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
--
cgit v0.10.2

From 7f1a169b88f513e32a432ca0f85bfd282d117bd6 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Thu, 25 Dec 2014 15:51:21 +0900
Subject: sched/fair: Fix RCU stall upon -ENOMEM in sched_create_group()

When alloc_fair_sched_group() in sched_create_group() fails,
free_sched_group() is called, and free_fair_sched_group() is called by
free_sched_group(). Since destroy_cfs_bandwidth() is called by
free_fair_sched_group() without calling init_cfs_bandwidth(), RCU stall
occurs at hrtimer_cancel():

  INFO: rcu_sched self-detected stall on CPU { 1}  (t=60000 jiffies g=13074 c=13073 q=0)
  Task dump for CPU 1:
  (fprintd)       R  running task        0  6249      1 0x00000088
  ...
  Call Trace:
   [] sched_show_task+0xa8/0x110
   [] dump_cpu_task+0x3d/0x50
   [] rcu_dump_cpu_stacks+0x90/0xd0
   [] rcu_check_callbacks+0x491/0x700
   [] update_process_times+0x4b/0x80
   [] tick_sched_handle.isra.20+0x36/0x50
   [] tick_sched_timer+0x42/0x70
   [] __run_hrtimer+0x69/0x1a0
   [] ? tick_sched_handle.isra.20+0x50/0x50
   [] hrtimer_interrupt+0xef/0x230
   [] local_apic_timer_interrupt+0x3b/0x70
   [] smp_apic_timer_interrupt+0x45/0x60
   [] apic_timer_interrupt+0x6d/0x80
   [] ? lock_hrtimer_base.isra.23+0x18/0x50
   [] ? __kmalloc+0x211/0x230
   [] hrtimer_try_to_cancel+0x22/0xd0
   [] ? __kmalloc+0x211/0x230
   [] hrtimer_cancel+0x22/0x30
   [] free_fair_sched_group+0x25/0xd0
   [] free_sched_group+0x16/0x40
   [] sched_create_group+0x4b/0x80
   [] sched_autogroup_create_attach+0x43/0x1c0
   [] sys_setsid+0x7c/0x110
   [] system_call_fastpath+0x12/0x17

Check whether init_cfs_bandwidth() was called before calling
destroy_cfs_bandwidth().

Signed-off-by: Tetsuo Handa
[ Move the check into destroy_cfs_bandwidth() to aid compilability. ]
Signed-off-by: Peter Zijlstra (Intel)
Cc: Paul Turner
Cc: Ben Segall
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/201412252210.GCC30204.SOMVFFOtQJFLOH@I-love.SAKURA.ne.jp
Signed-off-by: Ingo Molnar

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b99659..40667cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4005,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)

 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+        /* init_cfs_bandwidth() was not called */
+        if (!cfs_b->throttled_cfs_rq.next)
+                return;
+
         hrtimer_cancel(&cfs_b->period_timer);
         hrtimer_cancel(&cfs_b->slack_timer);
 }
--
cgit v0.10.2
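The one-line guard above relies on the containing task_group being zero-allocated: a cfs_bandwidth whose init_cfs_bandwidth() never ran still has a NULL ->next in its throttled_cfs_rq list head, whereas INIT_LIST_HEAD() points the head at itself. A rough userspace sketch of that idiom follows; the structures are simplified stand-ins, not the kernel's definitions:

#include <stdio.h>
#include <string.h>

/* Illustrative stand-ins for the kernel's list_head / cfs_bandwidth. */
struct list_head { struct list_head *next, *prev; };
struct cfs_bandwidth { struct list_head throttled_cfs_rq; /* timers elided */ };

static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
        /* What INIT_LIST_HEAD() does: point the head at itself. */
        cfs_b->throttled_cfs_rq.next = &cfs_b->throttled_cfs_rq;
        cfs_b->throttled_cfs_rq.prev = &cfs_b->throttled_cfs_rq;
}

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
        /* init_cfs_bandwidth() never ran: .next is still NULL from the
         * zeroed allocation, so there are no timers to cancel. */
        if (!cfs_b->throttled_cfs_rq.next)
                return;
        printf("cancelling timers\n");  /* hrtimer_cancel() in the kernel */
}

int main(void)
{
        struct cfs_bandwidth b;

        memset(&b, 0, sizeof(b));       /* mimics kzalloc() of the task_group */
        destroy_cfs_bandwidth(&b);      /* returns early, no stall */

        init_cfs_bandwidth(&b);
        destroy_cfs_bandwidth(&b);      /* now cancels the (elided) timers */
        return 0;
}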