diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 87abc7b033602ba3623f69024b5350da842932ba..eeb0b9cc0bee3149ca55edd220b25f814a9971dc 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -32,6 +32,7 @@
 #define MAP_HUGETLB	0x100000	/* create a huge page mapping */
 #define MAP_FIXED_NOREPLACE	0x200000/* MAP_FIXED which doesn't unmap underlying mapping */
 #define MAP_PA32BIT	0x400000	/* physical address is within 4G */
+#define MAP_CHECKNODE	0x800000	/* hugetlb numa node check */
 
 #define MS_ASYNC	1		/* sync memory asynchronously */
 #define MS_SYNC		2		/* synchronous memory sync */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 61cd225fcaa4767dad613bae421e2b6babab678e..00437067f14d08b6fe4ce3d06a86f9a37b595bb1 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -50,6 +50,7 @@
 #define MAP_HUGETLB	0x80000		/* create a huge page mapping */
 #define MAP_FIXED_NOREPLACE 0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 #define MAP_PA32BIT	0x400000	/* physical address is within 4G */
+#define MAP_CHECKNODE	0x800000	/* hugetlb numa node check */
 
 /*
  * Flags for msync
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 8516789076400db162bd1cfe4f409dd5343362f0..0bdf4ae5b69f1f4427cada8efdd5727601baa1b8 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -27,6 +27,7 @@
 #define MAP_FIXED_NOREPLACE 0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 #define MAP_UNINITIALIZED 0		/* uninitialized anonymous mmap */
 #define MAP_PA32BIT	0x400000	/* physical address is within 4G */
+#define MAP_CHECKNODE	0x800000	/* hugetlb numa node check */
 
 #define MS_SYNC		1		/* synchronous memory sync */
 #define MS_ASYNC	2		/* sync memory asynchronously */
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index f0eb04780148ae240443fa58c0a149a720b59718..908fa2ad02cc8300f202c65afb846baeaef56851 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -26,6 +26,7 @@
 #define MCL_FUTURE	0x4000		/* lock all additions to address space */
 #define MCL_ONFAULT	0x8000		/* lock all pages that are faulted in */
 #define MAP_PA32BIT	0x400000	/* physical address is within 4G */
+#define MAP_CHECKNODE	0x800000	/* hugetlb numa node check */
 
 /* Override any generic PKEY permission defines */
 #define PKEY_DISABLE_EXECUTE   0x4
diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h
index 8caf19c604d0aec2eee30ae806b374ef06006612..06578c16a683781ccca9ba6fde2d5824f310c2fb 100644
--- a/arch/sparc/include/uapi/asm/mman.h
+++ b/arch/sparc/include/uapi/asm/mman.h
@@ -22,5 +22,6 @@
 #define MCL_FUTURE	0x4000		/* lock all additions to address space */
 #define MCL_ONFAULT	0x8000		/* lock all pages that are faulted in */
 #define MAP_PA32BIT	0x400000	/* physical address is within 4G */
+#define MAP_CHECKNODE	0x800000	/* hugetlb numa node check */
 
 #endif /* _UAPI__SPARC_MMAN_H__ */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index a52ac8462b7da6f8ef98a21ebef9d955fc8a9ca2..717561c7e85a88ec0c66564e0ccfe9b748817103 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -57,6 +57,7 @@
 #define MAP_HUGETLB	0x80000		/* create a huge page mapping */
 #define MAP_FIXED_NOREPLACE 0x100000	/* MAP_FIXED which doesn't unmap underlying
 					   mapping */
 #define MAP_PA32BIT	0x400000	/* physical address is within 4G */
+#define MAP_CHECKNODE	0x800000	/* hugetlb numa node check */
 #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
 					 * uninitialized */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 96c5f4c5ee6e4f682805060ea8ea4aa4161143f9..2e2e4983f1ba70f98ff60b4bb4c44b5fabab8106 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -118,6 +118,45 @@ static void huge_pagevec_release(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
+/*
+ * Check that the requested numa node has enough free huge pages to
+ * back this hugetlb mapping. resv_huge_pages_node[] counts huge pages
+ * reserved by mmap() on that node but not yet faulted in.
+ */
+static int hugetlb_checknode(struct vm_area_struct *vma, long nr)
+{
+	int nid;
+	int ret = 0;
+	struct hstate *h = &default_hstate;
+
+	spin_lock(&hugetlb_lock);
+
+	nid = vma->vm_flags >> CHECKNODE_BITS;
+
+	if (nid >= MAX_NUMNODES) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (h->free_huge_pages_node[nid] < nr) {
+		ret = -ENOMEM;
+		goto err;
+	} else {
+		if (h->resv_huge_pages_node[nid] + nr >
+				h->free_huge_pages_node[nid]) {
+			ret = -ENOMEM;
+			goto err;
+		} else {
+			h->resv_huge_pages_node[nid] += nr;
+			ret = 0;
+		}
+	}
+
+err:
+	spin_unlock(&hugetlb_lock);
+	return ret;
+}
+
 /*
  * Mask used when checking the page offset value passed in via system
  * calls. This value will be converted to a loff_t which is signed.
@@ -175,6 +214,12 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	inode_lock(inode);
 	file_accessed(file);
 
+	if (is_set_cdmmask() && (vma->vm_flags & VM_CHECKNODE)) {
+		ret = hugetlb_checknode(vma, len >> huge_page_shift(h));
+		if (ret < 0)
+			goto out;
+	}
+
 	ret = -ENOMEM;
 	if (hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index bf734fb00a1d1c16c134d5dfecca12b28d8781ef..fd9635a6a92ffcaf04590a6398b8b0ac4ec1ba46 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -581,6 +581,7 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+	unsigned int resv_huge_pages_node[MAX_NUMNODES];
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 	unsigned int nr_free_vmemmap_pages;
 #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ae9b6688677f058c3bf24accb5c888d5233fea32..100c113e62a749614f838be6ebc5fd09df9333ba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -97,6 +97,15 @@
 extern const int mmap_rnd_compat_bits_max;
 extern int mmap_rnd_compat_bits __read_mostly;
 #endif
 
+#ifdef CONFIG_COHERENT_DEVICE
+static inline bool is_set_cdmmask(void)
+{
+	return !nodes_empty(cdmmask);
+}
+#else
+#define is_set_cdmmask() (0)
+#endif
+
 #include <asm/page.h>
 #include <asm/processor.h>
@@ -304,6 +313,8 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_CDM		0x100000000	/* Contains coherent device memory */
 #endif
 
+#define VM_CHECKNODE	0x200000000
+
 #ifdef CONFIG_USERSWAP
 /* bit[32:36] is the protection key of intel, so use a large value for VM_USWAP */
 #define VM_USWAP	0x2000000000000000
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 629cefc4ecba671682408ccdfe53a0a0726dcebd..7908bf3e5696761104d8b561f00988fcdbc7dec6 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -8,6 +8,21 @@
 #include <linux/atomic.h>
 #include <uapi/linux/mman.h>
 
+#ifdef CONFIG_COHERENT_DEVICE
+#define CHECKNODE_BITS	48
+#define CHECKNODE_MASK	(~((_AC(1, UL) << CHECKNODE_BITS) - 1))
+static inline void set_vm_checknode(vm_flags_t *vm_flags, unsigned long flags)
+{
+	if (is_set_cdmmask())
+		*vm_flags |= VM_CHECKNODE | ((((flags >> MAP_HUGE_SHIFT) &
+				MAP_HUGE_MASK) << CHECKNODE_BITS) & CHECKNODE_MASK);
+}
+#else
+#define CHECKNODE_BITS	(0)
+static inline void set_vm_checknode(vm_flags_t *vm_flags, unsigned long flags)
+{}
+#endif
+
 /*
  * Arrange for legacy / undefined architecture specific flags to be
  * ignored by mmap handling code.
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 344bb9b090a701ed195d1ecc20390e79e1add9d3..d7f0f48117b03a64135974a86fd755438877915e 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -5,6 +5,7 @@
 #include <asm-generic/mman-common.h>
 
 #define MAP_GROWSDOWN	0x0100		/* stack-like segment */
+#define MAP_CHECKNODE	0x0400		/* hugetlb numa node check */
 #define MAP_DENYWRITE	0x0800		/* ETXTBSY */
 #define MAP_EXECUTABLE	0x1000		/* mark it as an executable */
 #define MAP_LOCKED	0x2000		/* pages are locked */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6ae2d2e90681e453728819cd5ecc8a95eacd3d5c..d0672e4828794be4f098b70c953a61a54e96dced 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
 #include <linux/numa.h>
 #include <linux/llist.h>
 #include <linux/cma.h>
+#include <linux/mman.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -1164,6 +1165,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
 		SetHPageRestoreReserve(page);
 		h->resv_huge_pages--;
+		if (is_set_cdmmask() && (vma->vm_flags & VM_CHECKNODE))
+			h->resv_huge_pages_node[vma->vm_flags >> CHECKNODE_BITS]--;
 	}
 
 	mpol_cond_put(mpol);
diff --git a/mm/mmap.c b/mm/mmap.c
index f705137fd2487ed57437431cdea2d2aa8fc7aa5e..a208057be6f12e145120dff77d8d13da5085edc4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1581,6 +1581,12 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len,
 			vm_flags |= VM_NORESERVE;
 	}
 
+	/* Store the numa node id in vm_flags; the hugetlbfs mmap path
+	 * uses it to check free huge pages on that node.
+	 */
+	if (flags & MAP_CHECKNODE)
+		set_vm_checknode(&vm_flags, flags);
+
 	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
@@ -1825,12 +1831,23 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 	} else if (flags & MAP_HUGETLB) {
 		struct user_struct *user = NULL;
 		struct hstate *hs;
+		int page_size_log;
 
-		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
+		/*
+		 * If a CDM node is configured, flag bits [26:31] carry the
+		 * numa node to check instead of the huge page size log.
+		 */
+		if (is_set_cdmmask())
+			page_size_log = 0;
+		else
+			page_size_log = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
+
+		hs = hstate_sizelog(page_size_log);
 		if (!hs)
 			return -EINVAL;
 
 		len = ALIGN(len, huge_page_size(hs));
+
 		/*
 		 * VM_NORESERVE is used because the reservations will be
 		 * taken when vm_ops->mmap() is called
@@ -1839,8 +1856,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 		 */
 		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
 				VM_NORESERVE,
-				&user, HUGETLB_ANONHUGE_INODE,
-				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
+				&user, HUGETLB_ANONHUGE_INODE, page_size_log);
 		if (IS_ERR(file))
 			return PTR_ERR(file);
 	}
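Usage sketch (illustrative, not part of the diff): the snippet below shows how userspace would request a node-checked hugetlb mapping on a kernel carrying this patch, built with CONFIG_COHERENT_DEVICE and with a CDM node online. In that configuration ksys_mmap_pgoff() forces the default huge page size, so the MAP_HUGE_SHIFT bit field (bits 26..31) is repurposed to carry the numa node id that hugetlb_checknode() validates against free_huge_pages_node[]/resv_huge_pages_node[]. The node id 1, the 2 MB default huge page size, and the fallback macro values are assumptions for the example; MAP_CHECKNODE is 0x0400 on asm-generic architectures and 0x800000 on alpha/mips/parisc/powerpc/sparc/xtensa per the hunks above, and huge pages must already be preallocated on the target node.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB	0x40000		/* asm-generic value */
#endif
#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT	26
#endif
#ifndef MAP_CHECKNODE
#define MAP_CHECKNODE	0x0400		/* asm-generic value from this patch */
#endif

int main(void)
{
	size_t len = 2UL * 1024 * 1024;	/* one default-size (2 MB) huge page */
	unsigned long node = 1;		/* example: CDM node 1 */

	/*
	 * Node id travels in the MAP_HUGE_SHIFT bit field; with a CDM node
	 * present the kernel treats it as the node to check, not a size log.
	 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
		       MAP_CHECKNODE | (node << MAP_HUGE_SHIFT),
		       -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");		/* ENOMEM when node 1 lacks free huge pages */
		return 1;
	}

	memset(p, 0, len);		/* fault in the reserved huge page */
	munmap(p, len);
	return 0;
}

One consequence of this design is that, on kernels where a CDM node is configured, MAP_CHECKNODE mappings cannot also select a non-default huge page size: the MAP_HUGE_* field is wholly repurposed as a node id and page_size_log is forced to 0.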