summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/mm_types.h10
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/sched/fair.c18
-rw-r--r--kernel/sched/features.h4
4 files changed, 34 insertions, 1 deletions
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e850a23..197422a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -418,10 +418,20 @@ struct mm_struct {
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
+
+ /*
+ * The first node a task was scheduled on. If a task runs on
+ * a different node than Make PTE Scan Go Now.
+ */
+ int first_nid;
#endif
struct uprobes_state uprobes_state;
};
+/* first nid will either be a valid NID or one of these values */
+#define NUMA_PTE_SCAN_INIT -1
+#define NUMA_PTE_SCAN_ACTIVE -2
+
static inline void mm_init_cpumask(struct mm_struct *mm)
{
#ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7..296ea30 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -821,6 +821,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->first_nid = NUMA_PTE_SCAN_INIT;
+#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a02a20..3e18f61 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -861,6 +861,24 @@ void task_numa_work(struct callback_head *work)
return;
/*
+ * We do not care about task placement until a task runs on a node
+ * other than the first one used by the address space. This is
+ * largely because migrations are driven by what CPU the task
+ * is running on. If it's never scheduled on another node, it'll
+ * not migrate so why bother trapping the fault.
+ */
+ if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+ mm->first_nid = numa_node_id();
+ if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+ /* Are we running on a new node yet? */
+ if (numa_node_id() == mm->first_nid &&
+ !sched_feat_numa(NUMA_FORCE))
+ return;
+
+ mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+ }
+
+ /*
* Reset the scan period if enough time has gone by. Objective is that
* scanning will be reduced if pages are properly placed. As tasks
* can enter different phases this needs to be re-examined. Lacking
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d2373a3..e7c25ff 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -65,8 +65,10 @@ SCHED_FEAT(LB_MIN, false)
/*
* Apply the automatic NUMA scheduling policy. Enabled automatically
* at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
+ * numa_balancing=. Allow PTE scanning to be forced on UMA machines
+ * for debugging the core machinery.
*/
#ifdef CONFIG_NUMA_BALANCING
SCHED_FEAT(NUMA, false)
+SCHED_FEAT(NUMA_FORCE, false)
#endif