Commit 562117a9 authored by Nicholas Piggin, committed by Yang Yingliang

mm/vmalloc: Hugepage vmalloc mappings

ascend inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI
CVE: NA

https://lwn.net/ml/linux-kernel/20200825145753.529284-12-npiggin@gmail.com/

Don't distinguish between vmalloc and hugepage vmalloc, because there is no size
print in alloc_large_system_hash in v4.19.

This patch also adds page_order to struct vm_struct, which breaks kABI.
--------------

Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
support PMD-sized vmap mappings.

vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or
larger, and fall back to small pages if that is unsuccessful.
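
For illustration only (not part of the patch): with CONFIG_HAVE_ARCH_HUGE_VMALLOC
enabled, ordinary callers do not change at all; a request of at least PMD_SIZE
simply becomes eligible for PMD-backed mappings, with transparent fallback to 4K
pages. The helper names below are hypothetical.

#include <linux/vmalloc.h>
#include <linux/sizes.h>

/*
 * Hypothetical caller, for illustration only: 8MB is >= PMD_SIZE on
 * common configurations, so vmalloc() will try to back this request
 * with PMD-sized pages and mappings, and falls back to 4K pages if
 * the higher-order allocation fails.
 */
static void *alloc_big_buffer(void)
{
	return vmalloc(8 * SZ_1M);
}

static void free_big_buffer(void *buf)
{
	/* __vunmap() frees at area->page_order granularity */
	vfree(buf);
}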

Allocations that do not use PAGE_KERNEL prot are not permitted to use huge
pages, because not all callers expect this (e.g., module allocations vs
strict module rwx).
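
A simplified sketch of that policy follows (the helper name is hypothetical; in
the patch the check is open-coded in __vmalloc_node_range(), as shown in the
diff below):

/*
 * Sketch only, not part of the patch: huge mappings are attempted only
 * when they are enabled, the protection is plain PAGE_KERNEL, and the
 * per-node share of the request is at least PMD_SIZE.
 */
static bool want_huge_vmalloc(unsigned long size, pgprot_t prot, int node)
{
	unsigned long size_per_node = size;

	if (!vmap_allow_huge)		/* nohugevmalloc, or no arch support */
		return false;
	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL))
		return false;		/* e.g. module allocations with strict rwx */
	if (node == NUMA_NO_NODE)
		size_per_node /= num_online_nodes();
	return size_per_node >= PMD_SIZE;
}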

This reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node
POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.

This can result in more internal fragmentation and memory overhead for a
given allocation, so a nohugevmalloc boot option is added to disable huge
vmalloc mappings at boot.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Parent 3eb01fae
@@ -559,6 +559,10 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 config HAVE_ARCH_HUGE_VMAP
 	bool
 
+config HAVE_ARCH_HUGE_VMALLOC
+	depends on HAVE_ARCH_HUGE_VMAP
+	bool
+
 config HAVE_ARCH_SOFT_DIRTY
 	bool
@@ -39,6 +39,7 @@ struct vm_struct {
 	unsigned long		size;
 	unsigned long		flags;
 	struct page		**pages;
+	unsigned int		page_order;
 	unsigned int		nr_pages;
 	phys_addr_t		phys_addr;
 	const void		*caller;
@@ -41,6 +41,19 @@
 
 #include "internal.h"
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+	vmap_allow_huge = false;
+	return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
 struct vfree_deferred {
 	struct llist_head list;
 	struct work_struct wq;
@@ -410,6 +423,61 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
 	return 0;
 }
 
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	int err = 0;
+	int nr = 0;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr);
+		if (err)
+			return err;
+	} while (pgd++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+	WARN_ON(page_shift < PAGE_SHIFT);
+
+	if (page_shift == PAGE_SHIFT)
+		return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+		int err;
+
+		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+					__pa(page_address(pages[i])), prot,
+					page_shift);
+		if (err)
+			return err;
+
+		addr += 1UL << page_shift;
+	}
+
+	return 0;
+}
+
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+	int err;
+
+	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+	flush_cache_vmap(addr, end);
+	return err;
+}
+
 /**
  * map_kernel_range_noflush - map kernel VM area with the specified pages
  * @addr: start of the VM area to map
@@ -431,22 +499,7 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
 int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 			     pgprot_t prot, struct page **pages)
 {
-	unsigned long end = addr + size;
-	unsigned long next;
-	pgd_t *pgd;
-	int err = 0;
-	int nr = 0;
-
-	BUG_ON(addr >= end);
-	pgd = pgd_offset_k(addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr);
-		if (err)
-			return err;
-	} while (pgd++, addr = next, addr != end);
-
-	return 0;
+	return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
 }
 
 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
@@ -2270,11 +2323,11 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	if (deallocate_pages) {
 		int i;
 
-		for (i = 0; i < area->nr_pages; i++) {
+		for (i = 0; i < area->nr_pages; i += 1U << area->page_order) {
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
-			__free_pages(page, 0);
+			__free_pages(page, area->page_order);
 		}
 
 		kvfree(area->pages);
@@ -2403,9 +2456,12 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
 			    gfp_t gfp_mask, pgprot_t prot,
 			    int node, const void *caller);
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
-				 pgprot_t prot, int node)
+				 pgprot_t prot, unsigned int page_shift, int node)
 {
 	struct page **pages;
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long size = get_vm_area_size(area);
+	unsigned int page_order = page_shift - PAGE_SHIFT;
 	unsigned int nr_pages;
 	unsigned long array_size;
 	unsigned int i;
@@ -2415,7 +2471,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 					0 :
 					__GFP_HIGHMEM;
 
-	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+	nr_pages = size >> PAGE_SHIFT;
 	array_size = (unsigned long)nr_pages * sizeof(struct page *);
 
 	/* Please note that the recursion is strictly bounded. */
@@ -2434,27 +2490,27 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
 	area->pages = pages;
 	area->nr_pages = nr_pages;
+	area->page_order = page_order;
 
-	for (i = 0; i < area->nr_pages; i++) {
+	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
 		struct page *page;
+		int p;
 
-		if (node == NUMA_NO_NODE)
-			page = alloc_page(alloc_mask|highmem_mask);
-		else
-			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
-
+		page = alloc_pages_node(node, alloc_mask|highmem_mask, page_order);
 		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vunmap() */
 			area->nr_pages = i;
 			goto fail;
 		}
-		area->pages[i] = page;
+
+		for (p = 0; p < (1U << page_order); p++)
+			area->pages[i + p] = page + p;
+
 		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
 			cond_resched();
 	}
 
-	if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
-			prot, pages) < 0)
+	if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
 		goto fail;
 
 	return area->addr;
@@ -2462,7 +2518,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 fail:
 	warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
-			  (area->nr_pages*PAGE_SIZE), area->size);
+			  (area->nr_pages*PAGE_SIZE), size);
 	vfree(area->addr);
 	return NULL;
 }
@@ -2491,19 +2547,42 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	struct vm_struct *area;
 	void *addr;
 	unsigned long real_size = size;
+	unsigned long real_align = align;
+	unsigned int shift = PAGE_SHIFT;
 
-	size = PAGE_ALIGN(size);
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
 		goto fail;
 
+	if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) {
+		unsigned long size_per_node;
+
+		/*
+		 * Try huge pages. Only try for PAGE_KERNEL allocations,
+		 * others like modules don't yet expect huge pages in
+		 * their allocations due to apply_to_page_range not
+		 * supporting them.
+		 */
+		size_per_node = size;
+		if (node == NUMA_NO_NODE)
+			size_per_node /= num_online_nodes();
+		if (size_per_node >= PMD_SIZE) {
+			shift = PMD_SHIFT;
+			align = max(real_align, 1UL << shift);
+			size = ALIGN(real_size, 1UL << shift);
+		}
+	}
+
+again:
+	size = PAGE_ALIGN(size);
 	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
 				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
 
-	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+	addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
 	if (!addr)
-		return NULL;
+		goto fail;
 
 	/*
 	 * First make sure the mappings are removed from all page-tables
@@ -2523,8 +2602,19 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	return addr;
 
 fail:
-	warn_alloc(gfp_mask, NULL,
-			  "vmalloc: allocation failure: %lu bytes", real_size);
+	if (shift > PAGE_SHIFT) {
+		free_vm_area(area);
+		shift = PAGE_SHIFT;
+		align = real_align;
+		size = real_size;
+		goto again;
+	}
+
+	if (!area) {
+		/* Warn for area allocation, page allocations already warn */
+		warn_alloc(gfp_mask, NULL,
+			"vmalloc: allocation failure: %lu bytes", real_size);
+	}
 	return NULL;
 }
@@ -3503,7 +3593,7 @@ static int s_show(struct seq_file *m, void *p)
 		seq_printf(m, " %pS", v->caller);
 
 	if (v->nr_pages)
-		seq_printf(m, " pages=%d", v->nr_pages);
+		seq_printf(m, " pages=%d order=%d", v->nr_pages, v->page_order);
 
 	if (v->phys_addr)
 		seq_printf(m, " phys=%pa", &v->phys_addr);