From cadefd3d6cc914d95163ba1eda766bfe7ce1e5b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Feb 2014 10:40:35 +0100 Subject: sched: Make scale_rt_power() deal with backward clocks Mike reported that, while unlikely, its entirely possible for scale_rt_power() to see the time go backwards. This yields rather 'interesting' results. So like all other sites that deal with clocks; make this one ignore backward clock movement too. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140227094035.GZ9987@twins.programming.kicks-ass.net Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7570dd9..5e157f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5564,6 +5564,7 @@ static unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; + s64 delta; /* * Since we're reading these variables without serialization make sure @@ -5572,7 +5573,11 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq_clock(rq) - age_stamp); + delta = rq_clock(rq) - age_stamp; + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ -- cgit v0.10.2 From 27e4f9d0012a9bb7011aade862f08679d2921ab0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Apr 2014 12:50:34 +0200 Subject: sched/wait: Explain the shadowing and type inconsistencies Stick in a comment before someone else tries to fix the sparse warning this generates. Requested-by: Andrew Morton Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-o2ro6f3vkxklni0bc8f7m68s@git.kernel.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/wait.h b/include/linux/wait.h index 559044c..2b563a1 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -191,11 +191,23 @@ wait_queue_head_t *bit_waitqueue(void *, int); (!__builtin_constant_p(state) || \ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ +/* + * The below macro ___wait_event() has an explicit shadow of the __ret + * variable when used from the wait_event_*() macros. + * + * This is so that both can use the ___wait_cond_timeout() construct + * to wrap the condition. + * + * The type inconsistency of the wait_event_*() __ret variable is also + * on purpose; we use long where we can return timeout values and int + * otherwise. + */ + #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ wait_queue_t __wait; \ - long __ret = ret; \ + long __ret = ret; /* explicit shadow */ \ \ INIT_LIST_HEAD(&__wait.task_list); \ if (exclusive) \ -- cgit v0.10.2 From 10447917551e0fffb8d1892d46e633c3e0a9c1ec Mon Sep 17 00:00:00 2001 From: Kirill V Tkhai Date: Wed, 12 Mar 2014 06:18:33 -0400 Subject: sched/rt: Do not try to push tasks if pinned task switches to RT Just switched pinned task is not able to be pushed. If the rq had had several RT tasks before they have already been considered as candidates to be pushed (or pulled). Signed-off-by: Kirill V Tkhai Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140312061833.3a43aa64@gandalf.local.home Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bd2267a..1e4992e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1892,9 +1892,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->rt.overloaded && push_rt_task(rq) && + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ - rq != task_rq(p)) + push_rt_task(rq) && rq != task_rq(p)) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) -- cgit v0.10.2 From 8698a745d800c59cd5a576398bdeccd578ac66f1 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Mar 2014 18:09:12 +0800 Subject: sched, treewide: Replace hardcoded nice values with MIN_NICE/MAX_NICE Replace various -20/+19 hardcoded nice values with MIN_NICE/MAX_NICE. Signed-off-by: Dongsheng Yang Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ff13819fd09b7a5dba5ab5ae797f2e7019bdfa17.1394532288.git.yangds.fnst@cn.fujitsu.com Cc: devel@driverdev.osuosl.org Cc: devicetree@vger.kernel.org Cc: fcoe-devel@open-fcoe.org Cc: linux390@de.ibm.com Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: nbd-general@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Cc: openipmi-developer@lists.sourceforge.net Cc: qla2xxx-upstream@qlogic.com Cc: linux-arch@vger.kernel.org [ Consolidated the patches, twiddled the changelog. ] Signed-off-by: Ingo Molnar diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 66e8c3b..c8bf270 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -548,7 +548,7 @@ static int loop_thread(void *data) struct loop_device *lo = data; struct bio *bio; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 55298db..2a1f26b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -533,7 +533,7 @@ static int nbd_thread(void *data) struct nbd_device *nbd = data; struct request *req; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ wait_event_interruptible(nbd->waiting_wq, diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index a2af73d..ef166ad 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1463,7 +1463,7 @@ static int kcdrwd(void *foobar) struct packet_data *pkt; long min_sleep_time, residue; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_freezable(); for (;;) { diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index b7efd3c..0f20d36 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -998,7 +998,7 @@ static int ipmi_thread(void *data) struct timespec busy_until; ipmi_si_set_not_busy(&busy_until); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { int busy_wait; diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index ab3baa7..8eec165 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1803,7 +1803,7 @@ static int ap_poll_thread(void *data) int requests; struct ap_device *ap_dev; - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (1) { if (ap_suspend_flag) return 0; diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 6287f6a..3455cc5 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -464,7 +464,7 @@ static int bnx2fc_l2_rcv_thread(void *arg) struct fcoe_percpu_s *bg = arg; struct sk_buff *skb; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); @@ -602,7 +602,7 @@ int bnx2fc_percpu_io_thread(void *arg) struct bnx2fc_work *work, *tmp; LIST_HEAD(work_list); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index b5ffd28..d6d491c 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -1870,7 +1870,7 @@ int bnx2i_percpu_io_thread(void *arg) struct bnx2i_work *work, *tmp; LIST_HEAD(work_list); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop()) { spin_lock_bh(&p->p_work_lock); diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index f317000..843a679 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1872,7 +1872,7 @@ static int fcoe_percpu_receive_thread(void *arg) skb_queue_head_init(&tmp); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); retry: while (!kthread_should_stop()) { diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 23f5ba5..8dd4768 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -4515,7 +4515,7 @@ static int ibmvfc_work(void *data) struct ibmvfc_host *vhost = data; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (1) { rc = wait_event_interruptible(vhost->work_wait_q, diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index fa76440..2ebfb2b 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -2213,7 +2213,7 @@ static int ibmvscsi_work(void *data) struct ibmvscsi_host_data *hostdata = data; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (1) { rc = wait_event_interruptible(hostdata->work_wait_q, diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 59b51c5..294c072 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -731,7 +731,7 @@ lpfc_do_work(void *p) struct lpfc_hba *phba = p; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); current->flags |= PF_NOFREEZE; phba->data_flags = 0; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 19e99cc..afc8481 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4828,7 +4828,7 @@ qla2x00_do_dpc(void *data) ha = (struct qla_hw_data *)data; base_vha = pci_get_drvdata(ha->pdev); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index cfe4bc8..179b21b 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -441,7 +441,7 @@ static void binder_set_nice(long nice) "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); set_user_nice(current, min_nice); - if (min_nice < 20) + if (min_nice <= MAX_NICE) return; binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index f78eda2..d4c2cd9 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -407,7 +407,7 @@ static int loop_thread(void *data) int refcheck; int ret = 0; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); lo->lo_state = LLOOP_BOUND; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bf482df..7303929 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1107,7 +1107,7 @@ static int o2hb_thread(void *data) mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); /* Pin node */ o2nm_depend_this_node(); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a1..23343be 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg) static DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ee63af..c30c01b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -100,10 +100,10 @@ enum { /* * Rescue workers are used only on emergencies and shared by - * all cpus. Give -20. + * all cpus. Give MIN_NICE. */ - RESCUER_NICE_LEVEL = -20, - HIGHPRI_NICE_LEVEL = -20, + RESCUER_NICE_LEVEL = MIN_NICE, + HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 24, }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1546655..dcdb6f9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2803,7 +2803,7 @@ static int khugepaged(void *none) struct mm_slot *mm_slot; set_freezable(); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(); -- cgit v0.10.2 From 22abdef37cebcdd4933c72339401a174b7d87768 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:14:49 +0400 Subject: sched/rt: Sum number of all children tasks in hierarhy at ->rt_nr_running {inc,dec}_rt_tasks() used to count entities which are directly queued on the rt_rq. If an entity was not a task (i.e., it is some queue), its children were not counted. There is no problem here, but now we want to count number of all tasks which are actually queued under the rt_rq in all the hierarchy (except throttled rt queues). Empty queues are not able to be queued and all of the places, which use ->rt_nr_running, just compare it with zero, so we do not break anything here. Signed-off-by: Kirill Tkhai Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835289.18748.31.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org [ Twiddled the changelog. ] Signed-off-by: Ingo Molnar diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1e4992e..6892ed7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1045,12 +1045,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_RT_GROUP_SCHED */ static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + + if (group_rq) + return group_rq->rt_nr_running; + else + return 1; +} + +static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); - rt_rq->rt_nr_running++; + rt_rq->rt_nr_running += rt_se_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1062,7 +1073,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; + rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); -- cgit v0.10.2 From 653d07a6989a9a4166dcd1025aa252b3605737fd Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:14:55 +0400 Subject: sched/rt: Add accessors rq_of_rt_se() Two accessors for RT_GROUP_SCHED and !RT_GROUP_SCHED cases. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835295.18748.32.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6892ed7..f6aa3cd 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -112,6 +112,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return rt_se->rt_rq; } +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_se->rt_rq; + + return rt_rq->rq; +} + void free_rt_sched_group(struct task_group *tg) { int i; @@ -211,10 +218,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) return container_of(rt_rq, struct rq, rt); } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); + + return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct rq *rq = rq_of_rt_se(rt_se); return &rq->rt; } -- cgit v0.10.2 From f4ebcbc0d7e009783256c9daf76bc4b90e645c14 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:15:00 +0400 Subject: sched/rt: Substract number of tasks of throttled queues from rq->nr_running Now rq->rt becomes to be able to be in dequeued or enqueued state. We add new member rt_rq->rt_queued, which is used to indicate this. The member is used only for top queue rq->rt_rq. The goal is to fit generic scheme which is used in deadline and fair classes, i.e. throttled rt_rq's rt_nr_running is beeing substracted from rq->nr_running. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835300.18748.33.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f6aa3cd..2add019 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; @@ -404,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq) } #endif /* CONFIG_SMP */ +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq); + static inline int on_rt_rq(struct sched_rt_entity *rt_se) { return !list_empty(&rt_se->run_list); @@ -465,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) + if (!rt_se) + enqueue_top_rt_rq(rt_rq); + else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, false); + if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } @@ -479,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (rt_se && on_rt_rq(rt_se)) + if (!rt_se) + dequeue_top_rt_rq(rt_rq); + else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); } @@ -545,12 +555,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - if (rt_rq->rt_nr_running) - resched_task(rq_of_rt_rq(rt_rq)->curr); + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_task(rq->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { + dequeue_top_rt_rq(rt_rq); } static inline const struct cpumask *sched_rt_period_mask(void) @@ -935,6 +951,38 @@ static void update_curr_rt(struct rq *rq) } } +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (!rt_rq->rt_queued) + return; + + BUG_ON(!rq->nr_running); + + rq->nr_running -= rt_rq->rt_nr_running; + rt_rq->rt_queued = 0; +} + +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (rt_rq->rt_queued) + return; + if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + return; + + rq->nr_running += rt_rq->rt_nr_running; + rt_rq->rt_queued = 1; +} + #if defined CONFIG_SMP static void @@ -1143,6 +1191,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) back = rt_se; } + dequeue_top_rt_rq(rt_rq_of_se(back)); + for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se); @@ -1151,13 +1201,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, head); + enqueue_top_rt_rq(&rq->rt); } static void dequeue_rt_entity(struct sched_rt_entity *rt_se) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) { @@ -1166,6 +1221,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) if (rt_rq && rt_rq->rt_nr_running) __enqueue_rt_entity(rt_se, false); } + enqueue_top_rt_rq(&rq->rt); } /* @@ -1183,8 +1239,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - inc_nr_running(rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1195,8 +1249,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); - - dec_nr_running(rq); } /* @@ -1401,10 +1453,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) + if (!rt_rq->rt_queued) return NULL; put_prev_task(rq, prev); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 456e492..c8d9ee4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,8 @@ struct rt_rq { int overloaded; struct plist_head pushable_tasks; #endif + int rt_queued; + int rt_throttled; u64 rt_time; u64 rt_runtime; -- cgit v0.10.2 From 46383648b3c769fa74794ae6425ab993fc113bdb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:15:07 +0400 Subject: sched: Revert commit 4c6c4e38c4e9 ("sched/core: Fix endless loop in pick_next_task()") This reverts commit 4c6c4e38c4e9 ("sched/core: Fix endless loop in pick_next_task()"), which is not necessary after ("sched/rt: Substract number of tasks of throttled queues from rq->nr_running"). Signed-off-by: Kirill Tkhai Reviewed-by: Preeti U Murthy [conflict resolution with stop task checking patch] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835307.18748.34.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e157f1..43232b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6732,10 +6732,7 @@ static int idle_balance(struct rq *this_rq) out: /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running && - ((this_rq->stop && this_rq->stop->on_rq) || - this_rq->dl.dl_nr_running || - (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) + if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; if (pulled_task) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2add019..7795e29 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -493,6 +493,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -569,6 +574,11 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_top_rt_rq(rt_rq); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} + static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c8d9ee4..b2cbe81 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -425,18 +425,6 @@ struct rt_rq { #endif }; -#ifdef CONFIG_RT_GROUP_SCHED -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} -#else -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} -#endif - /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit v0.10.2 From 08f8aeb55d7727d644dbbbbfb798fe937d47751d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Apr 2014 14:27:25 +0200 Subject: sched: Remove set_need_resched() The last user is gone now, so we can safely remove this function. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index fddbe20..cb0cec9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -104,20 +104,6 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) -static inline __deprecated void set_need_resched(void) -{ - /* - * Use of this function in deprecated. - * - * As of this writing there are only a few users in the DRM tree left - * all of which are wrong and can be removed without causing too much - * grief. - * - * The DRM people are aware and are working on removing the last few - * instances. - */ -} - #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK -- cgit v0.10.2 From 52c324f8a87b336496d0f5e9d8dff1aa32bb08cd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 May 2014 00:13:47 +0200 Subject: cpuidle: Combine cpuidle_enabled() with cpuidle_select() Since both cpuidle_enabled() and cpuidle_select() are only called by cpuidle_idle_call(), it is not really useful to keep them separate and combining them will help to avoid complicating cpuidle_idle_call() even further if governors are changed to return error codes sometimes. This code modification shouldn't lead to any functional changes. Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 8236746..f38359f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -65,26 +65,6 @@ int cpuidle_play_dead(void) } /** - * cpuidle_enabled - check if the cpuidle framework is ready - * @dev: cpuidle device for this cpu - * @drv: cpuidle driver for this cpu - * - * Return 0 on success, otherwise: - * -NODEV : the cpuidle framework is not available - * -EBUSY : the cpuidle framework is not initialized - */ -int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev) -{ - if (off || !initialized) - return -ENODEV; - - if (!drv || !dev || !dev->enabled) - return -EBUSY; - - return 0; -} - -/** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu * @drv: cpuidle driver for this cpu @@ -138,6 +118,12 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, */ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { + if (off || !initialized) + return -ENODEV; + + if (!drv || !dev || !dev->enabled) + return -EBUSY; + return cpuidle_curr_governor->select(drv, dev); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index b0238cb..a8d5bd3 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -120,8 +120,6 @@ struct cpuidle_driver { #ifdef CONFIG_CPU_IDLE extern void disable_cpuidle(void); -extern int cpuidle_enabled(struct cpuidle_driver *drv, - struct cpuidle_device *dev); extern int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter(struct cpuidle_driver *drv, @@ -149,9 +147,6 @@ extern int cpuidle_play_dead(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else static inline void disable_cpuidle(void) { } -static inline int cpuidle_enabled(struct cpuidle_driver *drv, - struct cpuidle_device *dev) -{return -ENODEV; } static inline int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a..a8f1224 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -101,19 +101,13 @@ static int cpuidle_idle_call(void) rcu_idle_enter(); /* - * Check if the cpuidle framework is ready, otherwise fallback - * to the default arch specific idle method + * Ask the cpuidle framework to choose a convenient idle state. + * Fall back to the default arch specific idle method on errors. */ - ret = cpuidle_enabled(drv, dev); - - if (!ret) { - /* - * Ask the governor to choose an idle state it thinks - * it is convenient to go to. There is *always* a - * convenient idle state - */ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev); + ret = next_state; + if (ret >= 0) { /* * The idle task must be scheduled, it is pointless to * go to idle, just update no idle residency and get @@ -140,7 +134,7 @@ static int cpuidle_idle_call(void) CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); - if (!ret) { + if (ret >= 0) { trace_cpu_idle_rcuidle(next_state, dev->cpu); /* @@ -175,7 +169,7 @@ static int cpuidle_idle_call(void) * We can't use the cpuidle framework, let's use the default * idle routine */ - if (ret) + if (ret < 0) arch_cpu_idle(); __current_set_polling(); -- cgit v0.10.2 From 3836785a1bdcd6706c68ad46bf53adc0b057b310 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 May 2014 00:14:04 +0200 Subject: cpuidle / menu: Return (-1) if there are no suitable states If there is a PM QoS latency limit and all of the sufficiently shallow C-states are disabled, the cpuidle menu governor returns 0 which on some systems is CPUIDLE_DRIVER_STATE_START and shouldn't be returned if that C-state has been disabled. Fix the issue by modifying the menu governor to return (-1) in such situations. Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 71b5232..3ca15a8 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -296,7 +296,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->needs_update = 0; } - data->last_state_idx = 0; + data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) -- cgit v0.10.2 From bed4d597a0f99b380d24ab3a9da47b62cbf1ad0e Mon Sep 17 00:00:00 2001 From: Chander Kashyap Date: Tue, 22 Apr 2014 18:08:04 +0530 Subject: cpuidle / menu: move repeated correction factor check to init In menu_select function we check for correction factor every time. If it is zero we are initializing to unity. Hence move it to init function and initialise by unity, hence avoid repeated comparisons. Signed-off-by: Chander Kashyap Reviewed-by: Tuukka Tikkanen Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 3ca15a8..c4f80c1 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -311,13 +311,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->bucket = which_bucket(data->next_timer_us); /* - * if the correction factor is 0 (eg first time init or cpu hotplug - * etc), we actually want to start out with a unity factor. - */ - if (data->correction_factor[data->bucket] == 0) - data->correction_factor[data->bucket] = RESOLUTION * DECAY; - - /* * Force the result of multiplication to be 64 bits even if both * operands are 32 bits. * Make sure to round up for half microseconds. @@ -466,9 +459,17 @@ static int menu_enable_device(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct menu_device *data = &per_cpu(menu_devices, dev->cpu); + int i; memset(data, 0, sizeof(struct menu_device)); + /* + * if the correction factor is 0 (eg first time init or cpu hotplug + * etc), we actually want to start out with a unity factor. + */ + for(i = 0; i < BUCKETS; i++) + data->correction_factor[i] = RESOLUTION * DECAY; + return 0; } -- cgit v0.10.2 From a6220fc19afc07fe77cfd16f5b8e568615517091 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 5 May 2014 00:51:54 +0200 Subject: PM / suspend: Always use deepest C-state in the "freeze" sleep state If freeze_enter() is called, we want to bypass the current cpuidle governor and always use the deepest available (that is, not disabled) C-state, because we want to save as much energy as reasonably possible then and runtime latency constraints don't matter at that point, since the system is in a sleep state anyway. Signed-off-by: Rafael J. Wysocki Tested-by: Aubrey Li diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index f38359f..cb70199 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -32,6 +32,7 @@ LIST_HEAD(cpuidle_detected_devices); static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +static bool use_deepest_state __read_mostly; int cpuidle_disabled(void) { @@ -65,6 +66,45 @@ int cpuidle_play_dead(void) } /** + * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode. + * @enable: Whether enable or disable the feature. + * + * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and + * always use the state with the greatest exit latency (out of the states that + * are not disabled). + * + * This function can only be called after cpuidle_pause() to avoid races. + */ +void cpuidle_use_deepest_state(bool enable) +{ + use_deepest_state = enable; +} + +/** + * cpuidle_find_deepest_state - Find the state of the greatest exit latency. + * @drv: cpuidle driver for a given CPU. + * @dev: cpuidle device for a given CPU. + */ +static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + unsigned int latency_req = 0; + int i, ret = CPUIDLE_DRIVER_STATE_START - 1; + + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + struct cpuidle_state_usage *su = &dev->states_usage[i]; + + if (s->disabled || su->disable || s->exit_latency <= latency_req) + continue; + + latency_req = s->exit_latency; + ret = i; + } + return ret; +} + +/** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu * @drv: cpuidle driver for this cpu @@ -124,6 +164,9 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (!drv || !dev || !dev->enabled) return -EBUSY; + if (unlikely(use_deepest_state)) + return cpuidle_find_deepest_state(drv, dev); + return cpuidle_curr_governor->select(drv, dev); } @@ -155,7 +198,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ void cpuidle_reflect(struct cpuidle_device *dev, int index) { - if (cpuidle_curr_governor->reflect) + if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state)) cpuidle_curr_governor->reflect(dev, index); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a8d5bd3..c51a436 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -143,6 +143,7 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); +extern void cpuidle_use_deepest_state(bool enable); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else @@ -175,6 +176,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) {} static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8233cd4..155721f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -54,9 +54,11 @@ static void freeze_begin(void) static void freeze_enter(void) { + cpuidle_use_deepest_state(true); cpuidle_resume(); wait_event(suspend_freeze_wait_head, suspend_freeze_wake); cpuidle_pause(); + cpuidle_use_deepest_state(false); } void freeze_wake(void) -- cgit v0.10.2 From 792568ec6a31ca560ca4d528782cbc6cd2cea8b0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:27 -0400 Subject: sched/numa: Count pages on active node as local The NUMA code is smart enough to distribute the memory of workloads that span multiple NUMA nodes across those NUMA nodes. However, it still has a pretty high scan rate for such workloads, because any memory that is left on a node other than the node of the CPU that faulted on the memory is counted as non-local, which causes the scan rate to go up. Counting the memory on any node where the task's numa group is actively running as local, allows the scan rate to slow down once the application is settled in. This should reduce the overhead of the automatic NUMA placement code, when a workload spans multiple NUMA nodes. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5d859ec..f6457b6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1738,6 +1738,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); int priv; if (!numabalancing_enabled) @@ -1786,6 +1787,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. + */ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; + task_numa_placement(p); /* @@ -1800,7 +1812,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; + p->numa_faults_locality[local] += pages; } static void reset_ptenuma_scan(struct task_struct *p) -- cgit v0.10.2 From 5085e2a328849bdee6650b32d52c87c3788ab01c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:28 -0400 Subject: sched/numa: Retry placement more frequently when misplaced When tasks have not converged on their preferred nodes yet, we want to retry fairly often, to make sure we do not migrate a task's memory to an undesirable location, only to have to move it again later. This patch reduces the interval at which migration is retried, when the task's numa_scan_period is small. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6457b6..ecea8d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1326,12 +1326,15 @@ static int task_numa_migrate(struct task_struct *p) /* Attempt to migrate a task to a CPU on the preferred node. */ static void numa_migrate_preferred(struct task_struct *p) { + unsigned long interval = HZ; + /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) -- cgit v0.10.2 From 68d1b02a58f5d9f584c1fb2923ed60ec68cbbd9b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:29 -0400 Subject: sched/numa: Do not set preferred_node on migration to a second choice node Setting the numa_preferred_node for a task in task_numa_migrate does nothing on a 2-node system. Either we migrate to the node that already was our preferred node, or we stay where we were. On a 4-node system, it can slightly decrease overhead, by not calling the NUMA code as much. Since every node tends to be directly connected to every other node, running on the wrong node for a while does not do much damage. However, on an 8 node system, there are far more bad nodes than there are good ones, and pretending that a second choice is actually the preferred node can greatly delay, or even prevent, a workload from converging. The only time we can safely pretend that a second choice node is the preferred node is when the task is part of a workload that spans multiple NUMA nodes. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecea8d9..051903f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1301,7 +1301,16 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - sched_setnuma(p, env.dst_nid); + /* + * If the task is part of a workload that spans multiple NUMA nodes, + * and is migrating into one of the workload's active nodes, remember + * this node as the task's preferred numa node, so the workload can + * settle down. + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); /* * Reset the scan period if the task is being rescheduled on an -- cgit v0.10.2 From 107437febd495a50e2cd09c81bbaa84d30e57b07 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 29 Apr 2014 15:36:15 -0400 Subject: mm/numa: Remove BUG_ON() in __handle_mm_fault() Changing PTEs and PMDs to pte_numa & pmd_numa is done with the mmap_sem held for reading, which means a pmd can be instantiated and turned into a numa one while __handle_mm_fault() is examining the value of old_pmd. If that happens, __handle_mm_fault() should just return and let the page fault retry, instead of throwing an oops. This is handled by the test for pmd_trans_huge(*pmd) below. Signed-off-by: Rik van Riel Reviewed-by: Naoya Horiguchi Reported-by: Sunil Pandey Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Mel Gorman Cc: linux-mm@kvack.org Cc: lwoodman@redhat.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20140429153615.2d72098e@annuminas.surriel.com Signed-off-by: Ingo Molnar diff --git a/mm/memory.c b/mm/memory.c index d0f0bef..9c2dc65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3900,9 +3900,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } - /* THP should already have been handled */ - BUG_ON(pmd_numa(*pmd)); - /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could -- cgit v0.10.2 From 143e1e28cb40bed836b0a06567208bd7347c9672 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:37 +0200 Subject: sched: Rework sched_domain topology definition We replace the old way to configure the scheduler topology with a new method which enables a platform to declare additionnal level (if needed). We still have a default topology table definition that can be used by platform that don't want more level than the SMT, MC, CPU and NUMA ones. This table can be overwritten by an arch which either wants to add new level where a load balance make sense like BOOK or powergating level or wants to change the flags configuration of some levels. For each level, we need a function pointer that returns cpumask for each cpu, a function pointer that returns the flags for the level and a name. Only flags that describe topology, can be set by an architecture. The current topology flags are: SD_SHARE_CPUPOWER SD_SHARE_PKG_RESOURCES SD_NUMA SD_ASYM_PACKING Then, each level must be a subset on the next one. The build sequence of the sched_domain will take care of removing useless levels like those with 1 CPU and those with the same CPU span and no more relevant information for load balancing than its children. Signed-off-by: Vincent Guittot Tested-by: Dietmar Eggemann Reviewed-by: Preeti U Murthy Reviewed-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Chris Metcalf Cc: Christoph Lameter Cc: David S. Miller Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Hanjun Guo Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Tony Luck Cc: linux390@de.ibm.com Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1397209481-28542-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 5cb55a1..3202aa7 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -46,30 +46,6 @@ void build_cpu_to_node_map(void); -#define SD_CPU_INIT (struct sched_domain) { \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 2, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 05425b1..07763bdb 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -64,8 +64,6 @@ static inline void s390_init_cpu_topology(void) }; #endif -#define SD_BOOK_INIT SD_CPU_INIT - #include #endif /* _ASM_S390_TOPOLOGY_H */ diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h index d15c0d8..9383118 100644 --- a/arch/tile/include/asm/topology.h +++ b/arch/tile/include/asm/topology.h @@ -44,39 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int node) /* For now, use numa node -1 for global allocation. */ #define pcibus_to_node(bus) ((void)(bus), -1) -/* - * TILE architecture has many cores integrated in one processor, so we need - * setup bigger balance_interval for both CPU/NODE scheduling domains to - * reduce process scheduling costs. - */ - -/* sched_domains SD_CPU_INIT for TILE architecture */ -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 4, \ - .max_interval = 128, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 0*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 32, \ -} - /* By definition, we create nodes based on online memory. */ #define node_has_online_mem(nid) 1 diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a4298f..656b035 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -879,6 +879,27 @@ enum cpu_idle_type { extern int __weak arch_sd_sibiling_asym_packing(void); +#ifdef CONFIG_SCHED_SMT +static inline const int cpu_smt_flags(void) +{ + return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_SCHED_MC +static inline const int cpu_core_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_NUMA +static inline const int cpu_numa_flags(void) +{ + return SD_NUMA; +} +#endif + struct sched_domain_attr { int relax_domain_level; }; @@ -985,6 +1006,38 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const int (*sched_domain_flags_f)(void); + +#define SDTL_OVERLAP 0x01 + +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; +}; + +struct sched_domain_topology_level { + sched_domain_mask_f mask; + sched_domain_flags_f sd_flags; + int flags; + int numa_level; + struct sd_data data; +#ifdef CONFIG_SCHED_DEBUG + char *name; +#endif +}; + +extern struct sched_domain_topology_level *sched_domain_topology; + +extern void set_sched_topology(struct sched_domain_topology_level *tl); + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(type) .name = #type +#else +# define SD_INIT_NAME(type) +#endif + #else /* CONFIG_SMP */ struct sched_domain_attr; diff --git a/include/linux/topology.h b/include/linux/topology.h index 7062330..973671f 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -66,121 +66,6 @@ int arch_update_cpu_topology(void); #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif -/* - * Below are the 3 major initializers used in building sched_domains: - * SD_SIBLING_INIT, for SMT domains - * SD_CPU_INIT, for SMP domains - * - * Any architecture that cares to do any tuning to these values should do so - * by defining their own arch-specific initializer in include/asm/topology.h. - * A definition there will automagically override these default initializers - * and allow arch-specific performance tuning of sched_domains. - * (Only non-zero and non-null fields need be specified.) - */ - -#ifdef CONFIG_SCHED_SMT -/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, - * so can't we drop this in favor of CONFIG_SCHED_SMT? - */ -#define ARCH_HAS_SCHED_WAKE_IDLE -/* Common values for SMT siblings */ -#ifndef SD_SIBLING_INIT -#define SD_SIBLING_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 64, \ - .imbalance_pct = 110, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 1*SD_SHARE_CPUPOWER \ - | 1*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - | 0*SD_PREFER_SIBLING \ - | arch_sd_sibling_asym_packing() \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .smt_gain = 1178, /* 15% */ \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif -#endif /* CONFIG_SCHED_SMT */ - -#ifdef CONFIG_SCHED_MC -/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ -#ifndef SD_MC_INIT -#define SD_MC_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 1*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif -#endif /* CONFIG_SCHED_MC */ - -/* Common values for CPUs */ -#ifndef SD_CPU_INIT -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - | 1*SD_PREFER_SIBLING \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif - -#ifdef CONFIG_SCHED_BOOK -#ifndef SD_BOOK_INIT -#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! -#endif -#endif /* CONFIG_SCHED_BOOK */ - #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); @@ -295,4 +180,17 @@ static inline int cpu_to_mem(int cpu) #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +#ifdef CONFIG_SCHED_SMT +static inline const struct cpumask *cpu_smt_mask(int cpu) +{ + return topology_thread_cpumask(cpu); +} +#endif + +static inline const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} + + #endif /* _LINUX_TOPOLOGY_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13584f1..7d332b7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5566,17 +5566,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ - return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { - struct sched_domain **__percpu sd; - struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; -}; - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -5589,21 +5578,6 @@ enum s_alloc { sa_none, }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP 0x01 - -struct sched_domain_topology_level { - sched_domain_init_f init; - sched_domain_mask_f mask; - int flags; - int numa_level; - struct sd_data data; -}; - /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. @@ -5832,34 +5806,6 @@ int __weak arch_sd_sibling_asym_packing(void) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain * \ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ - *sd = SD_##type##_INIT; \ - SD_INIT_NAME(sd, type); \ - sd->private = &tl->data; \ - return sd; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif - static int default_relax_domain_level = -1; int sched_domain_level_max; @@ -5947,99 +5893,156 @@ static void claim_allocations(int cpu, struct sched_domain *sd) *per_cpu_ptr(sdd->sgp, cpu) = NULL; } -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ - return topology_thread_cpumask(cpu); -} -#endif - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC - { sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->init; tl++) - #ifdef CONFIG_NUMA - static int sched_domains_numa_levels; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ - if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) - return 0; - - return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUPOWER - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUPOWER | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING) static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int level = tl->numa_level; - int sd_weight = cpumask_weight( - sched_domains_numa_masks[level][cpu_to_node(cpu)]); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, .imbalance_pct = 125, - .cache_nice_tries = 2, - .busy_idx = 3, - .idle_idx = 2, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, .newidle_idx = 0, .wake_idx = 0, .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE - | 0*SD_BALANCE_EXEC - | 0*SD_BALANCE_FORK + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE - | 0*SD_WAKE_AFFINE + | 1*SD_WAKE_AFFINE | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES - | 1*SD_SERIALIZE + | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING - | 1*SD_NUMA - | sd_local_flags(level) + | 0*SD_NUMA + | sd_flags , + .last_balance = jiffies, .balance_interval = sd_weight, + .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif }; - SD_INIT_NAME(sd, NUMA); - sd->private = &tl->data; /* - * Ugly hack to pass state to sd_numa_mask()... + * Convert topological properties into behaviour. */ - sched_domains_curr_level = tl->numa_level; + + if (sd->flags & SD_SHARE_CPUPOWER) { + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + sd->flags |= arch_sd_sibling_asym_packing(); + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; return sd; } +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif +#ifdef CONFIG_SCHED_BOOK + { cpu_book_mask, SD_INIT_NAME(BOOK) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + static const struct cpumask *sd_numa_mask(int cpu) { return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -6183,7 +6186,10 @@ static void sched_init_numa(void) } } - tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -6191,18 +6197,19 @@ static void sched_init_numa(void) /* * Copy the default topology bits.. */ - for (i = 0; default_topology[i].init; i++) - tl[i] = default_topology[i]; + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; /* * .. and append 'j' levels of NUMA goodness. */ for (j = 0; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ - .init = sd_numa_init, .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, .flags = SDTL_OVERLAP, .numa_level = j, + SD_INIT_NAME(NUMA) }; } @@ -6360,7 +6367,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(tl, cpu); + struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; -- cgit v0.10.2 From 2dfd747629e65f89d2dbceb92fffc763f66228b2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:38 +0200 Subject: sched, s390: Create a dedicated topology table BOOK level is only relevant for s390 so we create a dedicated topology table with BOOK level and remove it from default table. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Philipp Hachtmann Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: dietmar.eggemann@arm.com Cc: preeti@linux.vnet.ibm.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: linux390@de.ibm.com Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1397209481-28542-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 07763bdb..56af530 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -26,21 +26,12 @@ extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; #define mc_capable() 1 -static inline const struct cpumask *cpu_coregroup_mask(int cpu) -{ - return &cpu_topology[cpu].core_mask; -} - -static inline const struct cpumask *cpu_book_mask(int cpu) -{ - return &cpu_topology[cpu].book_mask; -} - int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); void store_topology(struct sysinfo_15_1_x *info); void topology_expect_change(void); +const struct cpumask *cpu_coregroup_mask(int cpu); #else /* CONFIG_SCHED_BOOK */ diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 6298fed..1860ad3 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -443,6 +443,23 @@ int topology_cpu_init(struct cpu *cpu) return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); } +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_topology[cpu].core_mask; +} + +static const struct cpumask *cpu_book_mask(int cpu) +{ + return &cpu_topology[cpu].book_mask; +} + +static struct sched_domain_topology_level s390_topology[] = { + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_book_mask, SD_INIT_NAME(BOOK) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static int __init topology_init(void) { if (!MACHINE_HAS_TOPOLOGY) { @@ -451,6 +468,9 @@ static int __init topology_init(void) } set_topology_timer(); out: + + set_sched_topology(s390_topology); + return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); } device_initcall(topology_init); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d332b7..e59e5ae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6024,9 +6024,6 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif -#ifdef CONFIG_SCHED_BOOK - { cpu_book_mask, SD_INIT_NAME(BOOK) }, -#endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, }; -- cgit v0.10.2 From 607b45e9a216e89a63351556e488eea06be0ff48 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:39 +0200 Subject: sched, powerpc: Create a dedicated topology table Create a dedicated topology table for handling asymetric feature of powerpc. Signed-off-by: Vincent Guittot Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: Andy Fleming Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Preeti U. Murthy Cc: Rob Herring Cc: Srivatsa S. Bhat Cc: Toshi Kani Cc: Vasant Hegde Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: dietmar.eggemann@arm.com Cc: devicetree@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1397209481-28542-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index e2a4232..10ffffe 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -766,6 +766,28 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +#ifdef CONFIG_SCHED_SMT +/* cpumask of CPUs with asymetric SMT dependancy */ +static const int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +static struct sched_domain_topology_level powerpc_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { cpumask_var_t old_mask; @@ -790,15 +812,8 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); -} + set_sched_topology(powerpc_topology); -int arch_sd_sibling_asym_packing(void) -{ - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); - return SD_ASYM_PACKING; - } - return 0; } #ifdef CONFIG_HOTPLUG_CPU diff --git a/include/linux/sched.h b/include/linux/sched.h index 656b035..439a153 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -877,8 +877,6 @@ enum cpu_idle_type { #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_NUMA 0x4000 /* cross-node balancing */ -extern int __weak arch_sd_sibiling_asym_packing(void); - #ifdef CONFIG_SCHED_SMT static inline const int cpu_smt_flags(void) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e59e5ae..7e348e2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5796,11 +5796,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); } -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} - /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -5981,7 +5976,6 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) if (sd->flags & SD_SHARE_CPUPOWER) { sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ - sd->flags |= arch_sd_sibling_asym_packing(); } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; -- cgit v0.10.2 From d77b3ed5c9f8ebedf154b52b5e943c461f3d37e6 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:40 +0200 Subject: sched: Add a new SD_SHARE_POWERDOMAIN for sched_domain A new flag SD_SHARE_POWERDOMAIN is created to reflect whether groups of CPUs in a sched_domain level can or not reach different power state. As an example, the flag should be cleared at CPU level if groups of cores can be power gated independently. This information can be used in the load balance decision or to add load balancing level between group of CPUs that can power gate independantly. This flag is part of the topology flags that can be set by arch. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1397209481-28542-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 439a153..accb66b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,6 +870,7 @@ enum cpu_idle_type { #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e348e2..1c9c3b7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5261,7 +5261,8 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) return 0; } @@ -5292,7 +5293,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5901,6 +5903,7 @@ static int sched_domains_curr_level; * SD_SHARE_CPUPOWER - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -5909,7 +5912,8 @@ static int sched_domains_curr_level; (SD_SHARE_CPUPOWER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING) + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, int cpu) -- cgit v0.10.2 From fb2aa85564f4de35d25db022ab93640f8bb51821 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:41 +0200 Subject: sched, ARM: Create a dedicated scheduler topology table Create a dedicated topology table for ARM which will create new level to differentiate CPUs that can or not powergate independantly from others. The patch gives an example of how to add domain that will take advantage of SD_SHARE_POWERDOMAIN. Signed-off-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Cc: Arnd Bergmann Cc: Grant Likely Cc: Linus Torvalds Cc: Mark Brown Cc: Nicolas Pitre Cc: Rob Herring Cc: Russell King Cc: Sudeep KarkadaNagesha Cc: devicetree@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1397209481-28542-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 0bc94b1..71e1fec 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -185,6 +185,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +/* + * The current assumption is that we can power gate each core independently. + * This will be superseded by DT binding once available. + */ +const struct cpumask *cpu_corepower_mask(int cpu) +{ + return &cpu_topology[cpu].thread_sibling; +} + static void update_siblings_masks(unsigned int cpuid) { struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -266,6 +275,20 @@ void store_cpu_topology(unsigned int cpuid) cpu_topology[cpuid].socket_id, mpidr); } +static inline const int cpu_corepower_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN; +} + +static struct sched_domain_topology_level arm_topology[] = { +#ifdef CONFIG_SCHED_MC + { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) }, + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + /* * init_cpu_topology is called at boot when only one cpu is running * which prevent simultaneous write access to cpu_topology array @@ -289,4 +312,7 @@ void __init init_cpu_topology(void) smp_wmb(); parse_dt_topology(); + + /* Set scheduler topology descriptor */ + set_sched_topology(arm_topology); } -- cgit v0.10.2 From 39a4d9ca77a31503c6317e49742341d0859d5cb2 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 23 Apr 2014 18:30:35 -0700 Subject: sched/fair: Stop searching for tasks in newidle balance if there are runnable tasks It was found that when running some workloads (such as AIM7) on large systems with many cores, CPUs do not remain idle for long. Thus, tasks can wake/get enqueued while doing idle balancing. In this patch, while traversing the domains in idle balance, in addition to checking for pulled_task, we add an extra check for this_rq->nr_running for determining if we should stop searching for tasks to pull. If there are runnable tasks on this rq, then we will stop traversing the domains. This reduces the chance that idle balance delays a task from running. This patch resulted in approximately a 6% performance improvement when running a Java Server workload on an 8 socket machine. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/1398303035-18255-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 051903f..28ccf50 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6713,7 +6713,6 @@ static int idle_balance(struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); - /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -6728,7 +6727,12 @@ static int idle_balance(struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) break; } rcu_read_unlock(); -- cgit v0.10.2 From 2a705ad65c9c393df1a684c780f890a29aab1128 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 09:59:17 +0200 Subject: sched/idle, alpha: Switch from TS_POLLING to TIF_POLLING_NRFLAG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standardize the idle polling indicator to TIF_POLLING_NRFLAG such that both TIF_NEED_RESCHED and TIF_POLLING_NRFLAG are in the same word. This will allow us, using fetch_or(), to both set NEED_RESCHED and check for POLLING_NRFLAG in a single operation and avoid pointless wakeups. Changing from the non-atomic thread_info::status flags to the atomic thread_info::flags shouldn't be a big issue since most polling state changes were followed/preceded by a full memory barrier anyway. Signed-off-by: Peter Zijlstra Acked-by: Richard Henderson Cc: Andy Lutomirski Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Matt Turner Cc: Richard Henderson Cc: 蔡正龙 Cc: linux-alpha@vger.kernel.org Link: http://lkml.kernel.org/n/tip-9tfzr196gs0n2afxv0ga8pc3@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 3d6ce6d..48bbea6 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -73,12 +73,14 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TIF_SYSCALL_AUDIT 4 /* syscall audit active */ #define TIF_DIE_IF_KERNEL 9 /* dik recursion lock */ #define TIF_MEMDIE 13 /* is terminating due to OOM killer */ +#define TIF_POLLING_NRFLAG 14 /* idle is polling for TIF_NEED_RESCHED */ #define _TIF_SYSCALL_TRACE (1< Date: Fri, 11 Apr 2014 09:59:13 +0200 Subject: sched/idle, tile: Switch from TS_POLLING to TIF_POLLING_NRFLAG Standardize the idle polling indicator to TIF_POLLING_NRFLAG such that both TIF_NEED_RESCHED and TIF_POLLING_NRFLAG are in the same word. This will allow us, using fetch_or(), to both set NEED_RESCHED and check for POLLING_NRFLAG in a single operation and avoid pointless wakeups. Changing from the non-atomic thread_info::status flags to the atomic thread_info::flags shouldn't be a big issue since most polling state changes were followed/preceded by a full memory barrier anyway. Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: Andy Lutomirski Cc: Chris Metcalf Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-35zzwlvwr7cp8xj196y10yyx@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 729aa10..d767ff9 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -129,6 +129,7 @@ extern void _cpu_idle(void); #define TIF_MEMDIE 7 /* OOM killer at work */ #define TIF_NOTIFY_RESUME 8 /* callback before returning to user */ #define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */ +#define TIF_POLLING_NRFLAG 10 /* idle is polling for TIF_NEED_RESCHED */ #define _TIF_SIGPENDING (1< Date: Fri, 11 Apr 2014 09:59:08 +0200 Subject: sched/idle, ia64: Switch from TS_POLLING to TIF_POLLING_NRFLAG Standardize the idle polling indicator to TIF_POLLING_NRFLAG such that both TIF_NEED_RESCHED and TIF_POLLING_NRFLAG are in the same word. This will allow us, using fetch_or(), to both set NEED_RESCHED and check for POLLING_NRFLAG in a single operation and avoid pointless wakeups. Changing from the non-atomic thread_info::status flags to the atomic thread_info::flags shouldn't be a big issue since most polling state changes were followed/preceded by a full memory barrier anyway. Signed-off-by: Peter Zijlstra Cc: Fenghua Yu Cc: Linus Torvalds Cc: Tony Luck Cc: Andy Lutomirski Cc: linux-ia64@vger.kernel.org Link: http://lkml.kernel.org/n/tip-6633akuird5hi3si4gbegkm8@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 5957cf6..5b17418 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -107,6 +107,7 @@ struct thread_info { #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ #define TIF_RESTORE_RSE 21 /* user RBS is newer than kernel RBS */ +#define TIF_POLLING_NRFLAG 22 /* idle is polling for TIF_NEED_RESCHED */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) @@ -118,6 +119,7 @@ struct thread_info { #define _TIF_MCA_INIT (1 << TIF_MCA_INIT) #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) #define _TIF_RESTORE_RSE (1 << TIF_RESTORE_RSE) +#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) /* "work to do on user-return" bits */ #define TIF_ALLWORK_MASK (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\ @@ -125,7 +127,6 @@ struct thread_info { /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) -#define TS_POLLING 1 /* true if in idle loop and not sleeping */ #define TS_RESTORE_SIGMASK 2 /* restore signal mask in do_signal() */ #ifndef __ASSEMBLY__ -- cgit v0.10.2 From f80c5b39b80ab4d52acd940c03b89948cafdbd18 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 09:59:05 +0200 Subject: sched/idle, x86: Switch from TS_POLLING to TIF_POLLING_NRFLAG Standardize the idle polling indicator to TIF_POLLING_NRFLAG such that both TIF_NEED_RESCHED and TIF_POLLING_NRFLAG are in the same word. This will allow us, using fetch_or(), to both set NEED_RESCHED and check for POLLING_NRFLAG in a single operation and avoid pointless wakeups. Changing from the non-atomic thread_info::status flags to the atomic thread_info::flags shouldn't be a big issue since most polling state changes were followed/preceded by a full memory barrier anyway. Also, fix up the apm_32 idle function, clearly that was forgotten in the last conversion. The default idle state is !POLLING so just kill the lot. Signed-off-by: Peter Zijlstra Cc: Andy Lutomirski Cc: Jiri Kosina Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-7yksmqtlv4nfowmlqr1rifoi@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 47e5de2..8540538 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,6 +83,7 @@ struct thread_info { #define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ +#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ @@ -106,6 +107,7 @@ struct thread_info { #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) @@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void) * have to worry about atomic accesses. */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#define TS_POLLING 0x0004 /* idle task polling need_resched, - skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 3ab0343..f3a1f04 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -844,21 +844,10 @@ static int apm_do_idle(void) int polling; int err = 0; - polling = !!(current_thread_info()->status & TS_POLLING); - if (polling) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - } if (!need_resched()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); } - if (polling) - current_thread_info()->status |= TS_POLLING; if (!idled) return 0; -- cgit v0.10.2 From 69dd0f848879328ae6c6f54c2ec80e49eef042d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Apr 2014 14:30:10 +0200 Subject: sched/idle: Remove TS_POLLING support Now that there are no architectures left using it, kill the support for TS_POLLING. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andy Lutomirski Link: http://lkml.kernel.org/n/tip-6yurip2tfix2f4bfc5agu2s0@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index accb66b..725eef1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2775,51 +2775,9 @@ static inline int spin_needbreak(spinlock_t *lock) /* * Idle thread specific functions to determine the need_resched - * polling state. We have two versions, one based on TS_POLLING in - * thread_info.status and one based on TIF_POLLING_NRFLAG in - * thread_info.flags + * polling state. */ -#ifdef TS_POLLING -static inline int tsk_is_polling(struct task_struct *p) -{ - return task_thread_info(p)->status & TS_POLLING; -} -static inline void __current_set_polling(void) -{ - current_thread_info()->status |= TS_POLLING; -} - -static inline bool __must_check current_set_polling_and_test(void) -{ - __current_set_polling(); - - /* - * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() - */ - smp_mb(); - - return unlikely(tif_need_resched()); -} - -static inline void __current_clr_polling(void) -{ - current_thread_info()->status &= ~TS_POLLING; -} - -static inline bool __must_check current_clr_polling_and_test(void) -{ - __current_clr_polling(); - - /* - * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() - */ - smp_mb(); - - return unlikely(tif_need_resched()); -} -#elif defined(TIF_POLLING_NRFLAG) +#ifdef TIF_POLLING_NRFLAG static inline int tsk_is_polling(struct task_struct *p) { return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); -- cgit v0.10.2 From fd99f91aa007ba255aac44fe6cf21c1db398243a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Apr 2014 15:35:08 +0200 Subject: sched/idle: Avoid spurious wakeup IPIs Because mwait_idle_with_hints() gets called from !idle context it must call current_clr_polling(). This however means that resched_task() is very likely to send an IPI even when we were polling: CPU0 CPU1 if (current_set_polling_and_test()) goto out; __monitor(&ti->flags); if (!need_resched()) __mwait(eax, ecx); set_tsk_need_resched(p); smp_mb(); out: current_clr_polling(); if (!tsk_is_polling(p)) smp_send_reschedule(cpu); So while it is correct (extra IPIs aren't a problem, whereas a missed IPI would be) it is a performance problem (for some). Avoid this issue by using fetch_or() to atomically set NEED_RESCHED and test if POLLING_NRFLAG is set. Since a CPU stuck in mwait is unlikely to modify the flags word, contention on the cmpxchg is unlikely and thus we should mostly succeed in a single go. Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Andy Lutomirski Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-kf5suce6njh5xf5d3od13rr0@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1c9c3b7..4b82622 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -506,6 +506,39 @@ static inline void init_hrtick(void) #endif /* CONFIG_SCHED_HRTICK */ /* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (val)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) + +#ifdef TIF_POLLING_NRFLAG +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} +#endif + +/* * resched_task - mark a task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it @@ -521,17 +554,15 @@ void resched_task(struct task_struct *p) if (test_tsk_need_resched(p)) return; - set_tsk_need_resched(p); - cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); set_preempt_need_resched(); return; } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) + if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); } -- cgit v0.10.2 From c444117f0f39d59733ec23da67c44424df529230 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 13:47:16 +0200 Subject: sched/idle: Delay clearing the polling bit With the generic idle functions assuming !polling we should only clear the polling bit at the very last opportunity in order to avoid spurious IPIs. Ideally we'd flip the default to polling, but that means auditing all arch idle functions. Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Daniel Lezcano Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-vq7719foqzf6z5h4j7eh7f9e@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a..ed67f0c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,12 +78,10 @@ static int cpuidle_idle_call(void) /* * Check if the idle task must be rescheduled. If it is the - * case, exit the function after re-enabling the local irq and - * set again the polling flag + * case, exit the function after re-enabling the local irq. */ - if (current_clr_polling_and_test()) { + if (need_resched()) { local_irq_enable(); - __current_set_polling(); return 0; } @@ -127,7 +125,7 @@ static int cpuidle_idle_call(void) broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); - if (broadcast) + if (broadcast) { /* * Tell the time framework to switch * to a broadcast timer because our @@ -139,6 +137,7 @@ static int cpuidle_idle_call(void) ret = clockevents_notify( CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); + } if (!ret) { trace_cpu_idle_rcuidle(next_state, dev->cpu); @@ -175,8 +174,12 @@ static int cpuidle_idle_call(void) * We can't use the cpuidle framework, let's use the default * idle routine */ - if (ret) - arch_cpu_idle(); + if (ret) { + if (!current_clr_polling_and_test()) + arch_cpu_idle(); + else + local_irq_enable(); + } __current_set_polling(); -- cgit v0.10.2 From 37352273ad48f2d177ed1b06ced32d5536b773fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 13:55:48 +0200 Subject: sched/idle: Reflow cpuidle_idle_call() Apply goto to reduce lines and nesting levels. Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Daniel Lezcano Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-cc6vb0snt3sr7op6rlbfeqfh@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ed67f0c..88a6bc4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -73,7 +73,7 @@ static int cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); - int next_state, entered_state, ret; + int next_state, entered_state; bool broadcast; /* @@ -102,90 +102,75 @@ static int cpuidle_idle_call(void) * Check if the cpuidle framework is ready, otherwise fallback * to the default arch specific idle method */ - ret = cpuidle_enabled(drv, dev); - - if (!ret) { - /* - * Ask the governor to choose an idle state it thinks - * it is convenient to go to. There is *always* a - * convenient idle state - */ - next_state = cpuidle_select(drv, dev); - + if (cpuidle_enabled(drv, dev)) { +use_default: /* - * The idle task must be scheduled, it is pointless to - * go to idle, just update no idle residency and get - * out of this function + * We can't use the cpuidle framework, let's use the default + * idle routine. */ - if (current_clr_polling_and_test()) { - dev->last_residency = 0; - entered_state = next_state; + if (current_clr_polling_and_test()) local_irq_enable(); - } else { - broadcast = !!(drv->states[next_state].flags & - CPUIDLE_FLAG_TIMER_STOP); - - if (broadcast) { - /* - * Tell the time framework to switch - * to a broadcast timer because our - * local timer will be shutdown. If a - * local timer is used from another - * cpu as a broadcast timer, this call - * may fail if it is not available - */ - ret = clockevents_notify( - CLOCK_EVT_NOTIFY_BROADCAST_ENTER, - &dev->cpu); - } - - if (!ret) { - trace_cpu_idle_rcuidle(next_state, dev->cpu); - - /* - * Enter the idle state previously - * returned by the governor - * decision. This function will block - * until an interrupt occurs and will - * take care of re-enabling the local - * interrupts - */ - entered_state = cpuidle_enter(drv, dev, - next_state); - - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, - dev->cpu); - - if (broadcast) - clockevents_notify( - CLOCK_EVT_NOTIFY_BROADCAST_EXIT, - &dev->cpu); - - /* - * Give the governor an opportunity to reflect on the - * outcome - */ - cpuidle_reflect(dev, entered_state); - } - } + else + arch_cpu_idle(); + + goto exit_idle; } /* - * We can't use the cpuidle framework, let's use the default - * idle routine + * Ask the governor to choose an idle state it thinks + * it is convenient to go to. There is *always* a + * convenient idle state */ - if (ret) { - if (!current_clr_polling_and_test()) - arch_cpu_idle(); - else - local_irq_enable(); + next_state = cpuidle_select(drv, dev); + + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + goto exit_idle; } + broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + + /* + * Tell the time framework to switch to a broadcast timer + * because our local timer will be shutdown. If a local timer + * is used from another cpu as a broadcast timer, this call may + * fail if it is not available + */ + if (broadcast && + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + goto use_default; + + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + /* + * Enter the idle state previously returned by the governor decision. + * This function will block until an interrupt occurs and will take + * care of re-enabling the local interrupts + */ + entered_state = cpuidle_enter(drv, dev, next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + + if (broadcast) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + + /* + * Give the governor an opportunity to reflect on the outcome + */ + cpuidle_reflect(dev, entered_state); + +exit_idle: __current_set_polling(); /* - * It is up to the idle functions to enable back the local - * interrupt + * It is up to the idle functions to reenable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); -- cgit v0.10.2 From 08c373e5123b4595588ae1a7aa7e00a046c61cc6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Apr 2014 01:26:58 +0200 Subject: sched/idle: Make cpuidle_idle_call() void The only value ever returned by cpuidle_idle_call() is 0 and its only caller ignores that value anyway, so make it void. Signed-off-by: Rafael J. Wysocki Cc: Daniel Lezcano Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4717784.WmVEpDoliM@vostro.rjw.lan Signed-off-by: Ingo Molnar diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 88a6bc4..34083c9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -67,9 +67,8 @@ void __weak arch_cpu_idle(void) * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here - * return non-zero on failure */ -static int cpuidle_idle_call(void) +static void cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); @@ -82,7 +81,7 @@ static int cpuidle_idle_call(void) */ if (need_resched()) { local_irq_enable(); - return 0; + return; } /* @@ -177,8 +176,6 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); - - return 0; } /* -- cgit v0.10.2 From 31e6cdefd377ae90bea5c6a4e1dc519793577a24 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 9 May 2014 15:36:21 +0100 Subject: metag: Remove TIF_POLLING_NRFLAG The Meta idle function jumps into the interrupt handler which efficiently blocks waiting for the next interrupt when it reads the interrupt status register (TXSTATI). No other (polling) idle functions can be used, therefore TIF_POLLING_NRFLAG is unnecessary, so lets remove it. Peter Zijlstra said: > Most archs have (x86) hlt or (arm) wfi like idle instructions, and if > that is your only possible idle function, you'll require the interrupt > to wake up and there's really no point to having the POLLING bit. Signed-off-by: James Hogan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEB7E.9080007@imgtec.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index b19e9c5..4771133 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -117,10 +117,8 @@ static inline int kstack_end(void *addr) #define TIF_SECCOMP 5 /* secure computing */ #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ #define TIF_NOTIFY_RESUME 7 /* callback before returning to user */ -#define TIF_POLLING_NRFLAG 8 /* true if poll_idle() is polling - TIF_NEED_RESCHED */ -#define TIF_MEMDIE 9 /* is terminating due to OOM killer */ -#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint instrumentation */ +#define TIF_MEMDIE 8 /* is terminating due to OOM killer */ +#define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */ #define _TIF_SYSCALL_TRACE (1< Date: Fri, 9 May 2014 19:04:00 +0200 Subject: arm64: Remove TIF_POLLING_NRFLAG The only idle method for arm64 is WFI and it therefore unconditionally requires the reschedule interrupt when idle. Suggested-by: Catalin Marinas Signed-off-by: Peter Zijlstra Acked-by: Catalin Marinas Link: http://lkml.kernel.org/r/20140509170649.GG13658@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 720e70b..7b8e3a2 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -95,13 +95,11 @@ static inline struct thread_info *current_thread_info(void) * TIF_NEED_RESCHED - rescheduling necessary * TIF_NOTIFY_RESUME - callback before returning to user * TIF_USEDFPU - FPU was used by this task this quantum (SMP) - * TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_SYSCALL_TRACE 8 -#define TIF_POLLING_NRFLAG 16 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FREEZE 19 #define TIF_RESTORE_SIGMASK 20 -- cgit v0.10.2 From 3944a9274ef6cda0cc282daf0739832f661670f7 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Thu, 15 May 2014 15:59:20 -0700 Subject: sched: Fix exec_start/task_hot on migrated tasks task_hot checks exec_start on any runnable task, but if it has been migrated since the it last ran, then exec_start is a clock_task from another cpu. If the old cpu's clock_task was sufficiently far ahead of this cpu's then the task will not be considered for another migration until it has run. Instead reset exec_start whenever a task is migrated, since it is presumably no longer hot anyway. Signed-off-by: Ben Segall [ Made it compile. ] Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140515225920.7179.13924.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28ccf50..dd3fa14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4544,6 +4544,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic_long_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } + + /* We have migrated, no longer consider this task hot */ + se->exec_start = 0; } #endif /* CONFIG_SMP */ -- cgit v0.10.2 From e78c7bca56dab5ce4f22694f99115ec07e4935f6 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:28 +0200 Subject: sched: Simplify return logic in sched_copy_attr() The logic in this function is a little contorted, clean it up: * Rather than having chained gotos for the -EFBIG case, just return -EFBIG directly. * Now, the label 'out' is no longer needed, and 'ret' must be zero zero by the time we fall through to this point, so just return 0. Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEC24.9080201@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2551b6d..2318fc4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3657,13 +3657,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr, */ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); -out: - return ret; + return 0; err_size: put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; + return -E2BIG; } /** -- cgit v0.10.2 From 22400674945c562cf3c176d6c4178cd545e2dec9 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:33 +0200 Subject: sched: Simplify return logic in sched_read_attr() Gotos are chained pointlessly here, and the 'out' label can be dispensed with. Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEC29.9090503@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2318fc4..a78c5b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3821,7 +3821,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, for (; addr < end; addr++) { if (*addr) - goto err_size; + return -EFBIG; } attr->size = usize; @@ -3831,12 +3831,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, if (ret) return -EFAULT; -out: - return ret; - -err_size: - ret = -E2BIG; - goto out; + return 0; } /** -- cgit v0.10.2 From 8bf21433f38b020c3d8a3805d1d7fb73d7b40c01 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 14 May 2014 11:40:37 -0400 Subject: sched: Call select_idle_sibling() when not affine_sd On smaller systems, the top level sched domain will be an affine domain, and select_idle_sibling is invoked for every SD_WAKE_AFFINE wakeup. This seems to be working well. On larger systems, with the node distance between far away NUMA nodes being > RECLAIM_DISTANCE, select_idle_sibling is only called if the waker and the wakee are on nodes less than RECLAIM_DISTANCE apart. This patch leaves in place the policy of not pulling the task across nodes on such systems, while fixing the issue that select_idle_sibling is not called at all in certain circumstances. The code will look for an idle CPU in the same CPU package as the CPU where the task ran previously. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: morten.rasmussen@arm.com Cc: george.mccollister@gmail.com Cc: ktkhai@parallels.com Cc: Mel Gorman Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20140514114037.2d93266f@annuminas.surriel.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd3fa14..429164d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4473,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } -- cgit v0.10.2 From c515db8cd311ef77b2dc7cbd6b695022655bb0f3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 13 May 2014 11:11:01 +0200 Subject: sched/numa: Fix initialization of sched_domain_topology for NUMA Jet Chen has reported a kernel panics when booting qemu-system-x86_64 with kvm64 cpu. A panic occured while building the sched_domain. In sched_init_numa, we create a new topology table in which both default levels and numa levels are copied. The last row of the table must have a null pointer in the mask field. The current implementation doesn't add this last row in the computation of the table size. So we add 1 row in the allocation size that will be used as the last row of the table. The kzalloc will ensure that the mask field is NULL. Reported-by: Jet Chen Tested-by: Jet Chen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: fengguang.wu@intel.com Link: http://lkml.kernel.org/r/1399972261-25693-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a78c5b6..45d077e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6231,7 +6231,7 @@ static void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level) * + tl = kzalloc((i + level + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; -- cgit v0.10.2 From caffcdd8d27ba78730d5540396ce72ad022aff2c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 30 Apr 2014 14:39:38 +0100 Subject: sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups() There is no need to zero struct sched_group member cpumask and struct sched_group_power member power since both structures are already allocated as zeroed memory in __sdt_alloc(). This patch has been tested with BUG_ON(!cpumask_empty(sched_group_cpus(sg))); and BUG_ON(sg->sgp->power); in build_sched_groups() on ARM TC2 and INTEL i5 M520 platform including CPU hotplug scenarios. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1398865178-12577-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 45d077e..6340c60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5794,8 +5794,6 @@ build_sched_groups(struct sched_domain *sd, int cpu) continue; group = get_group(i, sdd, &sg); - cpumask_clear(sched_group_cpus(sg)); - sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { -- cgit v0.10.2 From a9467fa3cd2d5bf39e7cb7d0706d29d7ef4df212 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Thu, 8 May 2014 18:35:15 +0900 Subject: sched: Use clamp() and clamp_val() to make sys_nice() more readable Suggested-by: Kees Cook Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399541715-19568-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6340c60..f5605b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3057,17 +3057,10 @@ SYSCALL_DEFINE1(nice, int, increment) * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); nice = task_nice(current) + increment; - if (nice < MIN_NICE) - nice = MIN_NICE; - if (nice > MAX_NICE) - nice = MAX_NICE; + nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) return -EPERM; -- cgit v0.10.2 From 52a08ef1f13a11289c9e18cd4cfb4e51c024058b Mon Sep 17 00:00:00 2001 From: Jason Low Date: Thu, 8 May 2014 17:49:22 -0700 Subject: sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance() Currently, in idle_balance(), we update rq->next_balance when we pull_tasks. However, it is also important to update this in the !pulled_tasks case too. When the CPU is "busy" (the CPU isn't idle), rq->next_balance gets computed using sd->busy_factor (so we increase the balance interval when the CPU is busy). However, when the CPU goes idle, rq->next_balance could still be set to a large value that was computed with the sd->busy_factor. Thus, we need to also update rq->next_balance in idle_balance() in the cases where !pulled_tasks too, so that rq->next_balance gets updated without taking the busy_factor into account when the CPU is about to go idle. This patch makes rq->next_balance get updated independently of whether or not we pulled_task. Also, we add logic to ensure that we always traverse at least 1 of the sched domains to get a proper next_balance value for updating rq->next_balance. Additionally, since load_balance() modifies the sd->balance_interval, we need to re-obtain the sched domain's interval after the call to load_balance() in rebalance_domains() before we update rq->next_balance. This patch adds and uses 2 new helper functions, update_next_balance() and get_sd_balance_interval() to update next_balance and obtain the sched domain's balance_interval. Signed-off-by: Jason Low Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Link: http://lkml.kernel.org/r/1399596562.2200.7.camel@j-VirtualBox Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 429164d..26ec668 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6672,17 +6672,44 @@ out: return ld_moved; } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ + unsigned long interval = sd->balance_interval; + + if (cpu_busy) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ + unsigned long interval, next; + + interval = get_sd_balance_interval(sd, cpu_busy); + next = sd->last_balance + interval; + + if (time_after(*next_balance, next)) + *next_balance = next; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ static int idle_balance(struct rq *this_rq) { + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; - int this_cpu = this_rq->cpu; idle_enter_fair(this_rq); @@ -6692,8 +6719,15 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) + if (this_rq->avg_idle < sysctl_sched_migration_cost) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, 0, &next_balance); + rcu_read_unlock(); + goto out; + } /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6703,15 +6737,16 @@ static int idle_balance(struct rq *this_rq) update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - unsigned long interval; int continue_balancing = 1; u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, 0, &next_balance); break; + } if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); @@ -6727,9 +6762,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; + update_next_balance(sd, 0, &next_balance); /* * Stop searching for tasks to pull if there are @@ -6753,15 +6786,11 @@ static int idle_balance(struct rq *this_rq) if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; - } -out: /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; @@ -7044,16 +7073,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { if (!spin_trylock(&balancing)) goto out; @@ -7069,6 +7091,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing); -- cgit v0.10.2 From 72465447867b9de6b5cdea5d10f9781585136270 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 9 May 2014 03:00:14 +0400 Subject: sched, nohz: Change rq->nr_running to always use wrappers Sometimes ->nr_running may cross 2 but interrupt is not being sent to rq's cpu. In this case we don't reenable the timer. Looks like this may be the reason for rare unexpected effects, if nohz is enabled. Patch replaces all places of direct changing of nr_running and makes add_nr_running() caring about crossing border. Signed-off-by: Kirill Tkhai Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140508225830.2469.97461.stgit@localhost Signed-off-by: Ingo Molnar diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 800e99b..e0a04ae 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; - inc_nr_running(rq_of_dl_rq(dl_rq)); + add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; - dec_nr_running(rq_of_dl_rq(dl_rq)); + sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26ec668..f7cac2b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3325,7 +3325,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running -= task_delta; + sub_nr_running(rq, task_delta); cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -3376,7 +3376,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running += task_delta; + add_nr_running(rq, task_delta); /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3908,7 +3908,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { update_rq_runnable_avg(rq, rq->nr_running); - inc_nr_running(rq); + add_nr_running(rq, 1); } hrtick_update(rq); } @@ -3968,7 +3968,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } if (!se) { - dec_nr_running(rq); + sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } hrtick_update(rq); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7795e29..0ebfd7a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -973,7 +973,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) BUG_ON(!rq->nr_running); - rq->nr_running -= rt_rq->rt_nr_running; + sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; } @@ -989,7 +989,7 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) return; - rq->nr_running += rt_rq->rt_nr_running; + add_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 1; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b2cbe81..600e229 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1206,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); -static inline void inc_nr_running(struct rq *rq) +static inline void add_nr_running(struct rq *rq, unsigned count) { - rq->nr_running++; + unsigned prev_nr = rq->nr_running; + + rq->nr_running = prev_nr + count; #ifdef CONFIG_NO_HZ_FULL - if (rq->nr_running == 2) { + if (prev_nr < 2 && rq->nr_running >= 2) { if (tick_nohz_full_cpu(rq->cpu)) { /* Order rq->nr_running write against the IPI */ smp_wmb(); @@ -1221,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq) #endif } -static inline void dec_nr_running(struct rq *rq) +static inline void sub_nr_running(struct rq *rq, unsigned count) { - rq->nr_running--; + rq->nr_running -= count; } static inline void rq_last_tick_reset(struct rq *rq) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d6ce65d..bfe0eda 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - inc_nr_running(rq); + add_nr_running(rq, 1); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - dec_nr_running(rq); + sub_nr_running(rq, 1); } static void yield_task_stop(struct rq *rq) -- cgit v0.10.2 From a803f0261bb2bb57aab5542af3174db43b2a3887 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 8 May 2014 13:47:39 -0500 Subject: sched: Initialize rq->age_stamp on processor start If the sched_clock time starts at a large value, the kernel will spin in sched_avg_update for a long time while rq->age_stamp catches up with rq->clock. The comment in kernel/sched/clock.c says that there is no strict promise that it starts at zero. So initialize rq->age_stamp when a cpu starts up to avoid this. I was seeing long delays on a simulator that didn't start the clock at zero. This might also be an issue on reboots on processors that don't re-initialize the timer to zero on reset, and when using kexec. Signed-off-by: Corey Minyard Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399574859-11714-1-git-send-email-minyard@acm.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5605b6..da302ca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5089,10 +5089,20 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; +static void __cpuinit set_cpu_rq_start_time(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + rq->age_stamp = sched_clock_cpu(cpu); +} + static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + set_cpu_rq_start_time(); + return NOTIFY_OK; case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -6970,6 +6980,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); + set_cpu_rq_start_time(); #endif init_sched_fair_class(); -- cgit v0.10.2 From 7aa2c016db2162defff77f6f5731bff3f25e5175 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Thu, 8 May 2014 18:33:49 +0900 Subject: sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice() Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/a568a1e3cc8e78648f41b5035fa5e381d36274da.1399532322.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 179b21b..9311bb6 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -436,7 +436,7 @@ static void binder_set_nice(long nice) set_user_nice(current, nice); return; } - min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur; + min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index ac32258..d9cf5a5 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -41,4 +41,20 @@ #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +/* + * Convert nice value [19,-20] to rlimit style value [1,40]. + */ +static inline long nice_to_rlimit(long nice) +{ + return (MAX_NICE - nice + 1); +} + +/* + * Convert rlimit style value [1,40] to nice value [-20, 19]. + */ +static inline long rlimit_to_nice(long prio) +{ + return (MAX_NICE - prio + 1); +} + #endif /* _SCHED_PRIO_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da302ca..321d800 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3033,7 +3033,7 @@ EXPORT_SYMBOL(set_user_nice); int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; + int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); diff --git a/kernel/sys.c b/kernel/sys.c index fba0f29..66a751e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else p = current; if (p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } @@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else pgrp = task_pgrp(current); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) do_each_thread(g, p) { if (uid_eq(task_uid(p), uid)) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } -- cgit v0.10.2 From 4027d080854d1be96ef134a1c3024d5276114db6 Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Fri, 9 May 2014 03:21:27 +0000 Subject: sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code Signed-off-by: xiaofeng.yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399605687-18094-1-git-send-email-xiaofeng.yan@huawei.com Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 725eef1..0f91d00 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1175,8 +1175,8 @@ struct sched_dl_entity { /* * Original scheduling parameters. Copied here from sched_attr - * during sched_setscheduler2(), they will remain the same until - * the next sched_setscheduler2(). + * during sched_setattr(), they will remain the same until + * the next sched_setattr(). */ u64 dl_runtime; /* maximum runtime for each instance */ u64 dl_deadline; /* relative deadline of each instance */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e0a04ae..f9ca7d1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * We need to take care of a possible races here. In fact, the * task might have changed its scheduling policy to something * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setscheduler()). + * parameters (through sched_setattr()). */ if (!dl_task(p) || dl_se->dl_new) goto unlock; -- cgit v0.10.2 From e63da03639cc9e6e83b62e7ef8ffdbb92421416a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 14 May 2014 13:22:21 -0400 Subject: sched/numa: Allow task switch if load imbalance improves Currently the NUMA balancing code only allows moving tasks between NUMA nodes when the load on both nodes is in balance. This breaks down when the load was imbalanced to begin with. Allow tasks to be moved between NUMA nodes if the imbalance is small, or if the new imbalance is be smaller than the original one. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/20140514132221.274b3463@annuminas.surriel.com diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f7cac2b..b899613 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* + * The imbalance is above the allowed threshold. + * Compare it with the old imbalance. + */ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? */ + return (old_imb > imb); +} + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long dst_load, src_load; + long orig_src_load, src_load; + long orig_dst_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; /* XXX missing power terms */ load = task_h_load(env->p); - dst_load += load; - src_load -= load; + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; if (cur) { load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance: src_load += load; } - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) goto unlock; assign: -- cgit v0.10.2 From b1ad065e65f56103db8b97edbd218a271ff5b1bb Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 15 May 2014 13:03:06 -0400 Subject: sched/numa: Update migrate_improves/degrades_locality() Update the migrate_improves/degrades_locality() functions with knowledge of pseudo-interleaving. Do not consider moving tasks around within the set of group's active nodes as improving or degrading locality. Instead, leave the load balancer free to balance the load between a numa_group's active nodes. Also, switch from the group/task_weight functions to the group/task_fault functions. The "weight" functions involve a division, but both calls use the same divisor, so there's no point in doing that from these functions. On a 4 node (x10 core) system, performance of SPECjbb2005 seems unaffected, though the number of migrations with 2 8-warehouse wide instances seems to have almost halved, due to the scheduler running each instance on a single node. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/20140515130306.61aae7db@cuia.bos.redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b899613..503f750 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5123,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now) /* Returns true if the destination node has incurred more faults */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5136,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } + + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) return true; - return false; + return task_faults(p, dst_nid) > task_faults(p, src_nid); } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5165,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) return true; - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; + return task_faults(p, dst_nid) < task_faults(p, src_nid); } #else -- cgit v0.10.2 From 096aa33863a5e48de52d2ff30e0801b7487944f4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 16 May 2014 00:13:32 -0400 Subject: sched/numa: Decay ->wakee_flips instead of zeroing Affine wakeups have the potential to interfere with NUMA placement. If a task wakes up too many other tasks, affine wakeups will get disabled. However, regardless of how many other tasks it wakes up, it gets re-enabled once a second, potentially interfering with NUMA placement of other tasks. By decaying wakee_wakes in half instead of zeroing it, we can avoid that problem for some workloads. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: chegu_vinod@hp.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20140516001332.67f91af2@annuminas.surriel.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 503f750..c9617b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4065,7 +4065,7 @@ static void record_wakee(struct task_struct *p) * about the loss. */ if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } -- cgit v0.10.2