diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 067a90a1499c49f3ffe20a0e10829fbf2a8ac661..5a6afecbb0d070c05b86e4bbdb760a8dbb6554f5 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -245,6 +245,13 @@ MPOL_INTERLEAVED address range or file. During system boot up, the temporary interleaved system default policy works in this mode. +MPOL_PREFERRED_MANY + This mode specifies that the allocation should be preferably + satisfied from the nodemask specified in the policy. If there is + memory pressure on all nodes in the nodemask, the allocation + can fall back to all existing NUMA nodes. This is effectively + MPOL_PREFERRED applied to a nodemask rather than a single node. + NUMA memory policy supports the following optional mode flags: MPOL_F_STATIC_NODES @@ -253,10 +260,10 @@ MPOL_F_STATIC_NODES nodes changes after the memory policy has been defined. Without this flag, any time a mempolicy is rebound because of a - change in the set of allowed nodes, the node (Preferred) or - nodemask (Bind, Interleave) is remapped to the new set of - allowed nodes. This may result in nodes being used that were - previously undesired. + change in the set of allowed nodes, the preferred nodemask (Preferred + Many), preferred node (Preferred) or nodemask (Bind, Interleave) is + remapped to the new set of allowed nodes. This may result in nodes + being used that were previously undesired. With this flag, if the user-specified nodes overlap with the nodes allowed by the task's cpuset, then the memory policy is @@ -401,7 +408,7 @@ follows: Memory Policy APIs ================== -Linux supports 3 system calls for controlling memory policy. These APIS +Linux supports 4 system calls for controlling memory policy. These APIs always affect only the calling task, the calling task's address space, or some shared object mapped into the calling task's address space. @@ -453,6 +460,20 @@ requested via the 'flags' argument. See the mbind(2) man page for more details. +Set home node for a Range of Task's Address Space:: + + long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); + +sys_set_mempolicy_home_node sets the home node for a VMA policy present in the +task's address range. The system call updates the home node only for the existing +mempolicy range. Other address ranges are ignored. The home node is the NUMA node +from which page allocations for the range will preferably be satisfied. Specifying +the home node overrides the default policy of allocating memory close to the local +node of the executing CPU.
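A minimal userspace sketch (not part of this patch) of how the two additions combine: the range gets an MPOL_PREFERRED_MANY policy via mbind(2), then node 1 is picked as its home node. It assumes the asm-generic syscall number 450 wired below, MPOL_PREFERRED_MANY == 5 from the uapi enum in this series, no glibc wrappers yet (so raw syscall(2) is used), and that nodes 0 and 1 are online::

   /*
    * Hypothetical usage sketch, not part of this patch.  The syscall number
    * (450) and MPOL_PREFERRED_MANY (5) are taken from the uapi changes in
    * this series; nodes 0 and 1 are assumed to be online.
    */
   #include <stdio.h>
   #include <string.h>
   #include <sys/mman.h>
   #include <sys/syscall.h>
   #include <unistd.h>

   #ifndef __NR_set_mempolicy_home_node
   #define __NR_set_mempolicy_home_node 450
   #endif
   #ifndef MPOL_PREFERRED_MANY
   #define MPOL_PREFERRED_MANY 5
   #endif

   int main(void)
   {
           size_t len = 16UL << 20;        /* 16 MiB */
           unsigned long nodemask = 0x3;   /* prefer nodes 0 and 1 */
           void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

           if (addr == MAP_FAILED) {
                   perror("mmap");
                   return 1;
           }

           /* VMA policy: preferably allocate from nodes 0-1, fall back anywhere. */
           if (syscall(SYS_mbind, addr, len, MPOL_PREFERRED_MANY,
                       &nodemask, 8 * sizeof(nodemask) + 1, 0))
                   perror("mbind");

           /* Within the preferred set, start allocations from node 1. */
           if (syscall(__NR_set_mempolicy_home_node,
                       (unsigned long)addr, len, 1UL, 0UL))
                   perror("set_mempolicy_home_node");

           memset(addr, 0, len);           /* fault pages in under the policy */
           munmap(addr, len);
           return 0;
   }

With this in place, allocations for the range are attempted from nodes 0-1 starting at node 1 and only spill over to the remaining nodes when the preferred ones are under memory pressure, which is the two-pass behaviour implemented in alloc_pages_preferred_many() further down.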
+ + Memory Policy Command Line Interface ==================================== diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index dc18bb3681866168ed8c6910ee12bf4582bd732c..6e7d777ca2ad8bb7d5a132c0c9ee986580766da8 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -483,3 +483,4 @@ 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self +560 common set_mempolicy_home_node sys_ni_syscall diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 2a5d0253a7cb14eed7cb7161144990f7672edbce..735b12e65c6958c2ae93684cf64c558fccc092de 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -463,7 +463,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 39ce73c8084d3016fbd1eaadb7a7b7ce42a446b5..60e89a6ba3bd9e0b5cb4feb06b829671eb3534b6 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -42,7 +42,6 @@ #define __ARM_NR_compat_cacheflush (__ARM_NR_COMPAT_BASE + 2) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) - #define __NR_compat_syscalls 457 #endif diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 3a278103afd9033c9e5a0d42429746a9fcaf33f0..10d1b21f09d220436d3517c4567cfe4eb392d76a 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -908,8 +908,8 @@ __SYSCALL(__NR_kabi_reserved447, sys_ni_syscall) __SYSCALL(__NR_kabi_reserved448, sys_ni_syscall) #define __NR_kabi_reserved449 449 __SYSCALL(__NR_kabi_reserved449, sys_ni_syscall) -#define __NR_kabi_reserved450 450 -__SYSCALL(__NR_kabi_reserved450, sys_ni_syscall) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) #define __NR_kabi_reserved451 451 __SYSCALL(__NR_kabi_reserved451, sys_ni_syscall) #define __NR_kabi_reserved452 452 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index d5b5e5d1bd39acb682189b87fa5f6f43d2fc2c04..d6eba311cb3df8eed8044515458e4c570ee24ce8 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -364,3 +364,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index bdf6263478f5fba4c25fb790fbc57d304e19408b..5ac52266223177394ebeb01416937a55f8dca26b 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node 
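The per-architecture wiring above is uneven (alpha, for instance, still points the new slot at sys_ni_syscall), so userspace should not assume the call exists. A hypothetical runtime probe, not part of this patch and again assuming the asm-generic number 450, could look like this::

   /* Hypothetical feature probe, not part of this patch. */
   #include <errno.h>
   #include <stdio.h>
   #include <sys/syscall.h>
   #include <unistd.h>

   #ifndef __NR_set_mempolicy_home_node
   #define __NR_set_mempolicy_home_node 450
   #endif

   int main(void)
   {
           /*
            * Pass a deliberately misaligned start address: a kernel with this
            * patch rejects it with EINVAL, while an unpatched kernel (or an
            * arch where the slot is sys_ni_syscall) returns ENOSYS.
            */
           long ret = syscall(__NR_set_mempolicy_home_node, 1UL, 4096UL, 0UL, 0UL);

           if (ret == -1 && errno == ENOSYS)
                   puts("set_mempolicy_home_node: not available");
           else
                   puts("set_mempolicy_home_node: available");
           return 0;
   }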
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 5ad55e42b655f0f1003f588b37f931e0fe318eb9..8b95ed1df39afef6725e87d2f72ede252a433e44 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -449,3 +449,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 11e7fe97b995872ac4497ab5f656f6e9800d8214..6d2913ecc903ac3aec83ce7e5174e242582d62e5 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -382,3 +382,4 @@ 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self +450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index b09a3a8b3f77ff0938fea9feb3933f0eceb0cc3f..215d92ddeb0db163cb00e5abf59d06ddce129ec7 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -358,3 +358,4 @@ 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 263b4e4287642cf4ed90dcd833a63cd6805d7b67..fea634250d2ddbff3e95c0bd59591f8af212ae8f 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -431,3 +431,4 @@ 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self +450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index d699b3a3c728543e38812c1978bbe79683507348..df9294268449dbd47a7567fe64ddae8dd3528825 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -441,3 +441,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f48c87f72d9d06a29bbd783f1eddab49f9531398..2a7c804b00b8a00fe32c1a99902846a15ac5b156 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -539,7 +539,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 
c40d1e5a904b3c581e0a2ed2dd2531e48629acdd..4830c583b2be1c5be7a188e173f0f1f9d3cd65cf 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -446,3 +446,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 162485313465895a9b80061ba7f0aa1969c6b98d..1c10af5c6dc7af0d975f6c90f078148a02d63320 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -446,3 +446,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index bf6c45e30a66a4a9c10146188394cc53f54d876e..bda8abd0878e870a634a135c0e2597b0c7c96024 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -489,3 +489,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c89fdfb18c8b00c3eb50c5954a9f1ed181e719a9..c71dc25932b6249aa99e1e43cf3970f3fa13635f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -454,7 +454,7 @@ 447 i386 kabi_reserved447 sys_ni_syscall 448 i386 kabi_reserved448 sys_ni_syscall 449 i386 kabi_reserved449 sys_ni_syscall -450 i386 kabi_reserved450 sys_ni_syscall +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 451 i386 kabi_reserved451 sys_ni_syscall 452 i386 kabi_reserved452 sys_ni_syscall 453 i386 kabi_reserved453 sys_ni_syscall diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5775a88d1d026c3b4cc2256a4c931d7d79a6178d..a4f8fe52639b5187fd6e4c6759f3461194d8672e 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,7 +371,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 9b083a7b4e23dd55f814570e5bb177a225afea81..e1c88f068aa12327605f214e85400be36c99b890 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -414,3 +414,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 
ba74e7399dc6600a3dc54ba2724d158b000d9c59..d4920e4a3e3851e9bef3253e0621022b992d61de 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -57,6 +57,11 @@ struct mempolicy { } w; }; +struct mempolicy_wrapper { + struct mempolicy policy; + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ +}; + /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. @@ -154,13 +159,6 @@ extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, const nodemask_t *mask); extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - struct mempolicy *mpol = get_task_policy(current); - - return policy_nodemask(gfp, mpol); -} - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -204,6 +202,14 @@ extern void mpol_put_task_policy(struct task_struct *); extern long __do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags, struct mm_struct *mm); + +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return (pol->mode == MPOL_PREFERRED_MANY); +} + +extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); + #else struct mempolicy {}; @@ -315,9 +321,9 @@ static inline void mpol_put_task_policy(struct task_struct *task) { } -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) +static inline bool mpol_is_preferred_many(struct mempolicy *pol) { - return NULL; + return false; } #endif /* CONFIG_NUMA */ #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 5a9192eae603ed39a2fd97f33173cb4b09da9590..5e3ca1c49f3f15a90813759cea1b32a85260aabd 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1039,6 +1039,9 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type, const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); +asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 72f9235614dcab69a974ec16f73e4f1db82e8ebd..064f512fefd987ecfc6a72841bf4445b858ef30c 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -878,8 +878,8 @@ __SYSCALL(__NR_kabi_reserved447, sys_ni_syscall) __SYSCALL(__NR_kabi_reserved448, sys_ni_syscall) #define __NR_kabi_reserved449 449 __SYSCALL(__NR_kabi_reserved449, sys_ni_syscall) -#define __NR_kabi_reserved450 450 -__SYSCALL(__NR_kabi_reserved450, sys_ni_syscall) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) #define __NR_kabi_reserved451 451 __SYSCALL(__NR_kabi_reserved451, sys_ni_syscall) #define __NR_kabi_reserved452 452 diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 3354774af61e44272f9b30a10dbad465c4270f18..7c4ffc207f67945fe5d6ed904f36f579b5dfcdfd 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -22,6 +22,7 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, + MPOL_PREFERRED_MANY, MPOL_MAX, /* always last member of enum */ }; @@ -58,7 
+59,6 @@ enum { * are never OR'ed into the mode in mempolicy API arguments. */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ -#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 218dc39a4e32d39a244b9ae7c5a67d14c17cabbb..d264ee7e3d870898a03b8adecf426a24d7549726 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -298,6 +298,7 @@ COND_SYSCALL(migrate_pages); COND_SYSCALL_COMPAT(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL_COMPAT(move_pages); +COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1f641afc07562b8f0d937cb21d8ba77978a526af..a55197135afae76f6ea97d8d735435e00bee94f1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1154,7 +1154,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1175,7 +1175,19 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask, mpol); + + if (mpol_is_preferred_many(mpol)) { + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask, + mpol); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask, + mpol); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; @@ -2200,16 +2212,26 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); - mpol_cond_put(mpol); + if (mpol_is_preferred_many(mpol)) { + gfp_t gfp = gfp_mask | __GFP_NOWARN; + + gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + if (!page) + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + mpol_cond_put(mpol); return page; } @@ -3850,19 +3872,35 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", default_hugepagesz_setup); +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->v.nodes))) + return &mpol->v.nodes; +#endif + return NULL; +} + static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; - nodemask_t *mpol_allowed; + nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); - mpol_allowed = policy_nodemask_current(gfp_mask); - + mbind_nodemask = policy_mbind_nodemask(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mpol_allowed || - (mpol_allowed && node_isset(node, *mpol_allowed))) + if (!mbind_nodemask || + (mbind_nodemask && node_isset(node, *mbind_nodemask))) nr += array[node]; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e2927e81c738a913eb7e592fad3dde22425b16ec..b58ec3f98896caa7db6b8bf1a8de68ff5884ac46 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -31,6 +31,9 @@ * but useful to set in a VMA when you have a non default * process policy. * + * preferred many Try a set of nodes first before normal fallback. This is + * similar to preferred without the special case. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -119,13 +122,14 @@ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ -static struct mempolicy default_policy = { - .refcnt = ATOMIC_INIT(1), /* never free it */ - .mode = MPOL_PREFERRED, - .flags = MPOL_F_LOCAL, +static struct mempolicy_wrapper default_policy = { + .policy = { + .refcnt = ATOMIC_INIT(1), /* never free it */ + .mode = MPOL_LOCAL, + } }; -static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +static struct mempolicy_wrapper preferred_node_policy[MAX_NUMNODES]; /** * numa_map_to_online_node - Find closest online node @@ -163,13 +167,13 @@ struct mempolicy *get_task_policy(struct task_struct *p) node = numa_node_id(); if (node != NUMA_NO_NODE) { - pol = &preferred_node_policy[node]; + pol = &preferred_node_policy[node].policy; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } - return &default_policy; + return &default_policy.policy; } static const struct mempolicy_operations { @@ -236,12 +240,17 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { - if (!nodes) - pol->flags |= MPOL_F_LOCAL; /* local allocation */ - else if (nodes_empty(*nodes)) - return -EINVAL; /* no allowed nodes */ - else - pol->v.preferred_node = first_node(*nodes); + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.preferred_node = first_node(*nodes); + return 0; +} + +static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.nodes = *nodes; return 0; } @@ -256,8 +265,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes - * parameter with respect to the policy mode and flags. But, we need to - * handle an empty nodemask with MPOL_PREFERRED here. + * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. @@ -267,9 +275,14 @@ static int mpol_set_nodemask(struct mempolicy *pol, { int ret; - /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ - if (pol == NULL) + /* + * Default (pol==NULL) resp. local memory policies are not a + * subject of any remapping. They also do not need any special + * constructor. 
+ */ + if (!pol || pol->mode == MPOL_LOCAL) return 0; + /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); @@ -278,25 +291,18 @@ static int mpol_set_nodemask(struct mempolicy *pol, nodes_or(nsc->mask1, cdmmask, nsc->mask1); #endif VM_BUG_ON(!nodes); - if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) - nodes = NULL; /* explicit local allocation */ - else { - if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); - else - nodes_and(nsc->mask2, *nodes, nsc->mask1); - if (mpol_store_user_nodemask(pol)) - pol->w.user_nodemask = *nodes; - else - pol->w.cpuset_mems_allowed = - cpuset_current_mems_allowed; - } + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); + else + nodes_and(nsc->mask2, *nodes, nsc->mask1); - if (nodes) - ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; else - ret = mpol_ops[pol->mode].create(pol, NULL); + pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; + + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } @@ -307,6 +313,7 @@ static int mpol_set_nodemask(struct mempolicy *pol, static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { + struct mempolicy_wrapper *wrapper; struct mempolicy *policy; pr_debug("setting mode %d flags %d nodes[0] %lx\n", @@ -329,21 +336,24 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); + + mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); - mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); + wrapper = container_of(policy, struct mempolicy_wrapper, policy); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; + wrapper->home_node = NUMA_NO_NODE; return policy; } @@ -383,25 +393,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { - nodemask_t tmp; - - if (pol->flags & MPOL_F_STATIC_NODES) { - int node = first_node(pol->w.user_nodemask); - - if (node_isset(node, *nodes)) { - pol->v.preferred_node = node; - pol->flags &= ~MPOL_F_LOCAL; - } else - pol->flags |= MPOL_F_LOCAL; - } else if (pol->flags & MPOL_F_RELATIVE_NODES) { - mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); - pol->v.preferred_node = first_node(tmp); - } else if (!(pol->flags & MPOL_F_LOCAL)) { - pol->v.preferred_node = node_remap(pol->v.preferred_node, - pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; - } + pol->w.cpuset_mems_allowed = *nodes; } /* @@ -415,7 +407,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; - if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -466,6 +458,13 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_bind, .rebind = mpol_rebind_nodemask, }, + [MPOL_LOCAL] = { + .rebind = mpol_rebind_default, + }, + 
[MPOL_PREFERRED_MANY] = { + .create = mpol_new_preferred_many, + .rebind = mpol_rebind_preferred, + }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, @@ -938,18 +937,21 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { nodes_clear(*nodes); - if (p == &default_policy) + if (p == &default_policy.policy) return; switch (p->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_MANY: *nodes = p->v.nodes; break; + case MPOL_LOCAL: + /* return empty node mask for local allocation */ + break; + case MPOL_PREFERRED: - if (!(p->flags & MPOL_F_LOCAL)) - node_set(p->v.preferred_node, *nodes); - /* else return empty node mask for local allocation */ + node_set(p->v.preferred_node, *nodes); break; default: BUG(); @@ -1015,7 +1017,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, return -EINVAL; if (!pol) - pol = &default_policy; /* indicates default behavior */ + pol = &default_policy.policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { @@ -1040,7 +1042,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, goto out; } } else { - *policy = pol == &default_policy ? MPOL_DEFAULT : + *policy = pol == &default_policy.policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing @@ -1493,26 +1495,113 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } +/* Basic parameter sanity check used by both mbind() and set_mempolicy() */ +static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) +{ + *flags = *mode & MPOL_MODE_FLAGS; + *mode &= ~MPOL_MODE_FLAGS; + + if ((unsigned int)(*mode) >= MPOL_MAX) + return -EINVAL; + if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; + + return 0; +} + static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { + unsigned short mode_flags; nodemask_t nodes; + int lmode = mode; int err; - unsigned short mode_flags; start = untagged_addr(start); - mode_flags = mode & MPOL_MODE_FLAGS; - mode &= ~MPOL_MODE_FLAGS; - if (mode >= MPOL_MAX) - return -EINVAL; - if ((mode_flags & MPOL_F_STATIC_NODES) && - (mode_flags & MPOL_F_RELATIVE_NODES)) - return -EINVAL; + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; + err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_mbind(start, len, mode, mode_flags, &nodes, flags); + + return do_mbind(start, len, lmode, mode_flags, &nodes, flags); +} + +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct mempolicy_wrapper *wrapper; + struct vm_area_struct *vma; + struct mempolicy *new; + unsigned long vmstart; + unsigned long vmend; + unsigned long end; + int err = -ENOENT; + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; + + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. 
+ */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) + return -EINVAL; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + vma = find_vma(mm, start); + for (; vma && vma->vm_start < end; vma = vma->vm_next) { + + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + new = mpol_dup(vma_policy(vma)); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + /* + * Only update home node if there is an existing vma policy + */ + if (!new) + continue; + + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. + */ + if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + mpol_put(new); + err = -EOPNOTSUPP; + break; + } + + wrapper = container_of(new, struct mempolicy_wrapper, policy); + wrapper->home_node = home_node; + err = mbind_range(mm, vmstart, vmend, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, @@ -1526,20 +1615,20 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { - int err; + unsigned short mode_flags; nodemask_t nodes; - unsigned short flags; + int lmode = mode; + int err; + + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; - flags = mode & MPOL_MODE_FLAGS; - mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)mode >= MPOL_MAX) - return -EINVAL; - if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) - return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_set_mempolicy(mode, flags, &nodes); + + return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, @@ -1889,7 +1978,7 @@ bool vma_policy_mof(struct vm_area_struct *vma) return pol->flags & MPOL_F_MOF; } -static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; @@ -1915,22 +2004,36 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) */ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { + int mode = policy->mode; + /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(policy->mode == MPOL_BIND) && + if (unlikely(mode == MPOL_BIND) && apply_policy_zone(policy, gfp_zone(gfp)) && (cpuset_nodemask_valid_mems_allowed(&policy->v.nodes) || nodemask_has_cdm(policy->v.nodes))) return &policy->v.nodes; + if (mode == MPOL_PREFERRED_MANY) + return &policy->v.nodes; + return NULL; } -/* Return the node id preferred by the given mempolicy, or the given id */ +/* + * Return the preferred node id for 'prefer' mempolicy, and return + * the given id for all other policies. + * + * policy_node() is always coupled with policy_nodemask(), which + * secures the nodemask limit for 'bind' and 'prefer-many' policy. 
+ */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { - if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) + struct mempolicy_wrapper *wrapper; + + wrapper = container_of(policy, struct mempolicy_wrapper, policy); + if (policy->mode == MPOL_PREFERRED) { nd = policy->v.preferred_node; - else { + } else { /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the @@ -1946,6 +2049,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) } } + if ((policy->mode == MPOL_BIND || + policy->mode == MPOL_PREFERRED_MANY) && + wrapper->home_node != NUMA_NO_NODE) + return wrapper->home_node; + return nd; } @@ -1974,20 +2082,19 @@ unsigned int mempolicy_slab_node(void) return node; policy = current->mempolicy; - if (!policy || policy->flags & MPOL_F_LOCAL) + if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: - /* - * handled MPOL_F_LOCAL above - */ return policy->v.preferred_node; case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: { + case MPOL_BIND: + case MPOL_PREFERRED_MANY: + { struct zoneref *z; /* @@ -2001,6 +2108,8 @@ unsigned int mempolicy_slab_node(void) &policy->v.nodes); return z->zone ? zone_to_nid(z->zone) : node; } + case MPOL_LOCAL: + return node; default: BUG(); @@ -2066,12 +2175,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy - * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask + * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. - * If the effective policy is 'BIND, returns a pointer to the mempolicy's - * @nodemask for filtering the zonelist. + * If the effective policy is 'bind' or 'prefer-many', returns a pointer + * to the mempolicy's @nodemask for filtering the zonelist.
* * Must be protected by read_mems_allowed_begin() */ @@ -2079,16 +2188,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { int nid; + int mode; *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; /* assume !MPOL_BIND */ + *nodemask = NULL; + mode = (*mpol)->mode; - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + if (unlikely(mode == MPOL_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if ((*mpol)->mode == MPOL_BIND) + if ((*mpol)->mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) *nodemask = &(*mpol)->v.nodes; } return nid; @@ -2122,16 +2233,19 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: - if (mempolicy->flags & MPOL_F_LOCAL) - nid = numa_node_id(); - else - nid = mempolicy->v.preferred_node; + nid = mempolicy->v.preferred_node; init_nodemask_of_node(mask, nid); break; + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: - *mask = mempolicy->v.nodes; + *mask = mempolicy->v.nodes; + break; + + case MPOL_LOCAL: + nid = numa_node_id(); + init_nodemask_of_node(mask, nid); break; default: @@ -2206,6 +2320,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; } +static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + int nid, struct mempolicy *pol) +{ + struct page *page; + gfp_t preferred_gfp; + + /* + * This is a two pass approach. The first pass will only try the + * preferred nodes but skip the direct reclaim and allow the + * allocation to fail, while the second pass will try all the + * nodes in system. + */ + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = __alloc_pages(preferred_gfp, order, nid, &pol->v.nodes); + if (!page) + page = __alloc_pages(gfp, order, nid, NULL); + + return page; +} + /** * alloc_pages_vma - Allocate a page for a VMA. * @gfp: GFP flags. @@ -2241,6 +2376,13 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } + if (pol->mode == MPOL_PREFERRED_MANY) { + node = policy_node(gfp, pol, node); + page = alloc_pages_preferred_many(gfp, order, node, pol); + mpol_cond_put(pol); + goto out; + } + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { int hpage_node = node; @@ -2251,10 +2393,10 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * - * If the policy is interleave, or does not allow the current + * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. 
*/ - if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) + if (pol->mode == MPOL_PREFERRED) hpage_node = pol->v.preferred_node; nmask = policy_nodemask(gfp, pol); @@ -2306,7 +2448,7 @@ EXPORT_SYMBOL(alloc_pages_vma); */ struct page *alloc_pages(gfp_t gfp, unsigned order) { - struct mempolicy *pol = &default_policy; + struct mempolicy *pol = &default_policy.policy; struct page *page; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) @@ -2318,6 +2460,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) */ if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else if (pol->mode == MPOL_PREFERRED_MANY) + page = alloc_pages_preferred_many(gfp, order, + policy_node(gfp, pol, numa_node_id()), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), @@ -2352,17 +2497,22 @@ int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + struct mempolicy_wrapper *old_wrapper, *new_wrapper; if (!new) return ERR_PTR(-ENOMEM); + old_wrapper = container_of(old, struct mempolicy_wrapper, policy); + new_wrapper = container_of(new, struct mempolicy_wrapper, policy); + /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); - *new = *old; + *new_wrapper = *old_wrapper; task_unlock(current); - } else - *new = *old; + } else { + *new_wrapper = *old_wrapper; + } if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); @@ -2375,12 +2525,19 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { + struct mempolicy_wrapper *wrapper_a, *wrapper_b; + + wrapper_a = container_of(a, struct mempolicy_wrapper, policy); + wrapper_b = container_of(b, struct mempolicy_wrapper, policy); + if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; + if (wrapper_a->home_node != wrapper_b->home_node) + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; @@ -2390,10 +2547,10 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - /* a's ->flags is the same as b's */ - if (a->flags & MPOL_F_LOCAL) - return true; + case MPOL_PREFERRED_MANY: return a->v.preferred_node == b->v.preferred_node; + case MPOL_LOCAL: + return true; default: BUG(); return false; @@ -2531,16 +2688,19 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_PREFERRED: - if (pol->flags & MPOL_F_LOCAL) - polnid = numa_node_id(); - else - polnid = pol->v.preferred_node; + if (node_isset(curnid, pol->v.nodes)) + goto out; + polnid = pol->v.preferred_node; + break; + + case MPOL_LOCAL: + polnid = numa_node_id(); break; case MPOL_BIND: + case MPOL_PREFERRED_MANY: /* - * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. 
@@ -2840,7 +3000,7 @@ void __init numa_policy_init(void) int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", - sizeof(struct mempolicy), + sizeof(struct mempolicy_wrapper), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", @@ -2848,7 +3008,7 @@ void __init numa_policy_init(void) 0, SLAB_PANIC, NULL); for_each_node(nid) { - preferred_node_policy[nid] = (struct mempolicy) { + preferred_node_policy[nid].policy = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, @@ -2900,9 +3060,6 @@ void numa_default_policy(void) * Parse and format mempolicy from/to strings */ -/* - * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. - */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -2910,6 +3067,7 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", + [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2980,7 +3138,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) */ if (nodelist) goto out; - mode = MPOL_PREFERRED; break; case MPOL_DEFAULT: /* @@ -2989,6 +3146,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (!nodelist) err = 0; goto out; + case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist @@ -3024,7 +3182,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) else if (nodelist) new->v.preferred_node = first_node(nodes); else - new->flags |= MPOL_F_LOCAL; + new->mode = MPOL_LOCAL; /* * Save nodes for contextualization: this will be used to "clone" @@ -3063,20 +3221,20 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; - if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { + if (pol && pol != &default_policy.policy && + !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: + case MPOL_LOCAL: break; case MPOL_PREFERRED: - if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; - else - node_set(pol->v.preferred_node, nodes); + node_set(pol->v.preferred_node, nodes); break; + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->v.nodes; diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index ddd5d28b5b7fa5c4d729c0877a0511e0ca744753..7b3a3c84cac1684e9f53d5c32189a295357b5582 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -878,8 +878,8 @@ __SYSCALL(__NR_kabi_reserved447, sys_ni_syscall) __SYSCALL(__NR_kabi_reserved448, sys_ni_syscall) #define __NR_kabi_reserved449 449 __SYSCALL(__NR_kabi_reserved449, sys_ni_syscall) -#define __NR_kabi_reserved450 450 -__SYSCALL(__NR_kabi_reserved450, sys_ni_syscall) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) #define __NR_kabi_reserved451 451 __SYSCALL(__NR_kabi_reserved451, sys_ni_syscall) #define __NR_kabi_reserved452 452 diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index
c68c1111c155d488406bde05c1f07a408fc3c697..af58772e41b7ea579c459207051d1f635c6c7bcf 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -539,7 +539,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index d2fa9647ce252ee1ac3c2878d4af46a440c82806..fe015307e16385d0a449d73b6e587149e9204f09 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -388,3 +388,4 @@ 378 common s390_guarded_storage sys_s390_guarded_storage compat_sys_s390_guarded_storage 379 common statx sys_statx compat_sys_statx 380 common s390_sthyi sys_s390_sthyi compat_sys_s390_sthyi +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node \ No newline at end of file diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 8f4ad1695d8f0bc7109e47d7e40f92d24022f0c2..e52491d39345acb8c9beab6b446156ebabf055e9 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,7 +371,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall
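For completeness, a hypothetical sketch (not part of this patch) of using the new mode as a task-wide policy through set_mempolicy(2), again assuming MPOL_PREFERRED_MANY == 5 from the uapi enum and no libnuma support yet::

   /* Hypothetical task-wide example, not part of this patch. */
   #include <stdio.h>
   #include <sys/syscall.h>
   #include <unistd.h>

   #ifndef MPOL_PREFERRED_MANY
   #define MPOL_PREFERRED_MANY 5
   #endif

   int main(void)
   {
           unsigned long nodemask = 0x3;   /* nodes 0 and 1, assumed online */

           if (syscall(SYS_set_mempolicy, MPOL_PREFERRED_MANY,
                       &nodemask, 8 * sizeof(nodemask) + 1)) {
                   perror("set_mempolicy");
                   return 1;
           }
           /*
            * From here on, this task's allocations prefer nodes 0-1 and fall
            * back to the remaining nodes only under memory pressure.
            */
           return 0;
   }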