diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e850a23dd6ecd9c76743f579bad5f5f4df0d3cd5..197422a1598c38d21a01695286bd6f9e473357bc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -418,10 +418,20 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; + + /* + * The first node a task was scheduled on. Once a task runs on + * a different node than this one, PTE scanning is activated. + */ + int first_nid; #endif struct uprobes_state uprobes_state; }; +/* first_nid is either a valid node ID or one of these sentinel values */ +#define NUMA_PTE_SCAN_INIT -1 +#define NUMA_PTE_SCAN_ACTIVE -2 + static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2951eff91a4e09e0af23a90992747..296ea308096d4c37bd03ff45d833bd32474d7081 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -820,6 +820,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; +#endif +#ifdef CONFIG_NUMA_BALANCING + mm->first_nid = NUMA_PTE_SCAN_INIT; #endif if (!mm_init(mm, tsk)) goto fail_nomem; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a02a2082e95154d5094ded219b20f76ac98a525..3e18f611a5aa6d15e41c2e32186da7587e386ef7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -860,6 +860,24 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate, so there is no reason to trap the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? 
*/ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + /* * Reset the scan period if enough time has gone by. Objective is that * scanning will be reduced if pages are properly placed. As tasks diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d2373a3e32528ae6869248188108dc2283cd249e..e7c25fff1e94df70faab87d4d97772ba2a2cdd4e 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -65,8 +65,10 @@ SCHED_FEAT(LB_MIN, false) /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= + * numa_balancing=. NUMA_FORCE allows PTE scanning to be forced on + * UMA machines for debugging the core machinery. */ #ifdef CONFIG_NUMA_BALANCING SCHED_FEAT(NUMA, false) +SCHED_FEAT(NUMA_FORCE, false) #endif