diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 886c900323f145d05913c300c4309dcfac6dd4de..cc5eec8959a07f8f4a0782f8796931adee8b1ef9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1945,7 +1945,7 @@
 	keepinitrd	[HW,ARM]
 
 	kernelcore=	[KNL,X86,IA-64,PPC,ARM64]
-			Format: nn[KMGTPE] | nn% | "mirror"
+			Format: nn[KMGTPE] | nn% | "mirror" | "reliable"
 			This parameter specifies the amount of memory usable by
 			the kernel for non-movable allocations.  The requested
 			amount is spread evenly throughout all nodes in the
@@ -1969,6 +1969,9 @@
 			for Movable pages.  "nn[KMGTPE]", "nn%", and "mirror"
 			are exclusive, so you cannot specify multiple forms.
 
+			Option "reliable" is based on option "mirror" and
+			extends it. The two options are mutually exclusive.
+
 	kgdbdbgp=	[KGDB,HW] kgdb over EHCI usb debug port.
 			Format: <Controller#>[,poll interval]
 			The controller # is the number of the ehci usb debug
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f78d1e89593fd91275e4bba7625c40e2ecd40267..152cb9bdf43659e399bb8e25b5ad1508c2c7917c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -45,6 +45,12 @@ struct vm_area_struct;
 #define ___GFP_NOLOCKDEP	0
 #endif
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */
+#ifdef CONFIG_MEMORY_RELIABLE
+/* add the flag at the end of gfp_mask to avoid a kabi change */
+#define ___GFP_RELIABILITY	0x40000000u
+#else
+#define ___GFP_RELIABILITY	0
+#endif
 
 /*
  * Physical address zone modifiers (see linux/mmzone.h - low four bits)
@@ -446,6 +452,12 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 	z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
 					 ((1 << GFP_ZONES_SHIFT) - 1);
 	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
+
+#ifdef CONFIG_MEMORY_RELIABLE
+	if (z == ZONE_MOVABLE && (flags & ___GFP_RELIABILITY))
+		return ZONE_NORMAL;
+#endif
+
 	return z;
 }
 
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
new file mode 100644
index 0000000000000000000000000000000000000000..b03108441e37a37f4b834c0ca717e0ff7c1e4227
--- /dev/null
+++ b/include/linux/mem_reliable.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __MM_MEM_RELIABLE__
+#define __MM_MEM_RELIABLE__
+
+#include <linux/stddef.h>
+#include <linux/gfp.h>
+#include <linux/mmzone.h>
+#include <linux/mm_types.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_MEMORY_RELIABLE
+
+extern struct static_key_false mem_reliable;
+
+extern bool reliable_enabled;
+
+extern void add_reliable_mem_size(long sz);
+extern void mem_reliable_init(bool has_unmirrored_mem,
+			      unsigned long *zone_movable_pfn);
+
+static inline bool mem_reliable_is_enabled(void)
+{
+	return static_branch_likely(&mem_reliable);
+}
+
+static inline bool zone_reliable(struct zone *zone)
+{
+	return mem_reliable_is_enabled() && zone_idx(zone) < ZONE_MOVABLE;
+}
+
+static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z)
+{
+	if (!mem_reliable_is_enabled())
+		return false;
+
+	if (!current->mm || (current->flags & PF_KTHREAD))
+		return false;
+
+	/* normal user tasks may only allocate from the non-mirrored region */
+	if (!(gfp & ___GFP_RELIABILITY) && (gfp & __GFP_HIGHMEM) &&
+	    (gfp & __GFP_MOVABLE)) {
+		if (zonelist_zone_idx(z) < ZONE_MOVABLE)
+			return true;
+	}
+
+	return false;
+}
+#else
+#define reliable_enabled	0
+
+static inline bool mem_reliable_is_enabled(void) { return false; }
+static inline void add_reliable_mem_size(long sz) {}
+static inline void mem_reliable_init(bool has_unmirrored_mem,
+				     unsigned long *zone_movable_pfn) {}
+static inline bool zone_reliable(struct zone *zone) { return false; }
+static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z)
+{
+	return false;
+}
+
+#endif
+
+#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index be0be448c3f1967301f6b8e798b385f8766c94a0..630b103065f4c7b9bb369c5a126fd6998f357f09 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -28,6 +28,9 @@
 #include
 #include
 
+/* added to mm.h to avoid every caller adding a new header file */
+#include <linux/mem_reliable.h>
+
 struct mempolicy;
 struct anon_vma;
 struct anon_vma_chain;
diff --git a/mm/Kconfig b/mm/Kconfig
index 12601505c4a4ae99bee1b3592c055c85ff7ad3b7..80d7b47ca9f53d9f7f9aa2d8a376a8349c3b7367 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -831,4 +831,22 @@
 	  We record the pid of dump task in the reserve memory, and
 	  reserve the pids before init task start. In restore process,
 	  free the reserved pids and realloc them for use.
+
+config MEMORY_RELIABLE
+	bool "Support for memory reliable"
+	depends on ARM64
+	default n
+	help
+	  Memory reliable is based on mirrored memory and adds these
+	  features: a) normal user tasks never allocate memory from the
+	  mirrored region; b) special user tasks allocate memory from the
+	  mirrored region by default; c) an upper limit on the mirrored
+	  memory allocated for user tasks, tmpfs and the page cache.
+	  Special user tasks, tmpfs and the page cache can fall back to
+	  the non-mirrored region if the reliable fallback mechanism is
+	  enabled.
+
+	  To enable this feature, mirrored memory is required and
+	  "kernelcore=reliable" must be added to the kernel parameters.
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 8fba091be3868be0345b7df29a303d6813bf883f..741f9c250914cf54213308de85755fe4a41deba0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -109,3 +109,4 @@ obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o
 obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
 obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
+obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c
new file mode 100644
index 0000000000000000000000000000000000000000..2e21839ca49fb4dfbb5b2d373f36489795190ea7
--- /dev/null
+++ b/mm/mem_reliable.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "mem reliable: " fmt
+
+#include <linux/mm.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+
+DEFINE_STATIC_KEY_FALSE(mem_reliable);
+
+bool reliable_enabled;
+
+static atomic_long_t total_reliable_mem;
+
+void add_reliable_mem_size(long sz)
+{
+	atomic_long_add(sz, &total_reliable_mem);
+}
+
+static int reliable_mem_notifier(struct notifier_block *nb,
+				 unsigned long action, void *arg)
+{
+	struct memory_notify *m_arg = arg;
+	struct zone *zone;
+
+	switch (action) {
+	case MEM_ONLINE:
+		zone = page_zone(pfn_to_page(m_arg->start_pfn));
+		if (zone_reliable(zone))
+			add_reliable_mem_size(m_arg->nr_pages * PAGE_SIZE);
+		break;
+	case MEM_OFFLINE:
+		zone = page_zone(pfn_to_page(m_arg->start_pfn));
+		if (zone_reliable(zone))
+			add_reliable_mem_size(-m_arg->nr_pages * PAGE_SIZE);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block reliable_notifier_block = {
+	.notifier_call = reliable_mem_notifier,
+};
+
+void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn)
+{
+	if (!reliable_enabled)
+		return;
+
+	if (atomic_long_read(&total_reliable_mem) == 0) {
+		memset(zone_movable_pfn, 0,
+		       sizeof(unsigned long) * MAX_NUMNODES);
+		pr_err("init failed, mirrored memory size is zero\n");
+		return;
+	}
+
+	if (!has_unmirrored_mem) {
+		pr_err("init failed, unmirrored memory size is zero\n");
+		return;
+	}
+
+	if (register_hotmemory_notifier(&reliable_notifier_block)) {
+		pr_err("init failed, register memory notifier failed\n");
+		return;
+	}
+
+	static_branch_enable(&mem_reliable);
+
+	pr_info("init succeeded, mirrored memory size: %ld bytes\n",
+		atomic_long_read(&total_reliable_mem));
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4cad86f1e3a91c65c3d0bbb5979fedf619c9aea7..e1e513e851dec2c7bcf5b0e3be42a6ad95057ca4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3454,6 +3454,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 		struct page *page;
 		unsigned long mark;
 
+		/* skip non-movable zones for normal user tasks */
+		if (skip_none_movable_zone(gfp_mask, z))
+			continue;
+
 		/*
 		 * CDM nodes get skipped if the requested gfp flag
 		 * does not have __GFP_THISNODE set or the nodemask
@@ -4557,6 +4561,18 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
 			ac->high_zoneidx, ac->nodemask);
 }
 
+static inline void prepare_before_alloc(gfp_t *gfp_mask)
+{
+	gfp_t gfp_ori = *gfp_mask;
+	*gfp_mask &= gfp_allowed_mask;
+
+	if (!mem_reliable_is_enabled())
+		return;
+
+	if (gfp_ori & ___GFP_RELIABILITY)
+		*gfp_mask |= ___GFP_RELIABILITY;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -4578,7 +4594,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 		return NULL;
 	}
 
-	gfp_mask &= gfp_allowed_mask;
+	prepare_before_alloc(&gfp_mask);
+
 	alloc_mask = gfp_mask;
 	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
 		return NULL;
@@ -6912,10 +6929,13 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	 */
 	if (mirrored_kernelcore) {
 		bool mem_below_4gb_not_mirrored = false;
+		bool has_unmirrored_mem = false;
 
 		for_each_memblock(memory, r) {
-			if (memblock_is_mirror(r))
+			if (memblock_is_mirror(r)) {
+				add_reliable_mem_size(r->size);
 				continue;
+			}
 
 			nid = r->nid;
@@ -6926,6 +6946,7 @@
 				continue;
 			}
 
+			has_unmirrored_mem = true;
 			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
 				min(usable_startpfn, zone_movable_pfn[nid]) :
 				usable_startpfn;
@@ -6934,6 +6955,8 @@
 		if (mem_below_4gb_not_mirrored)
 			pr_warn("This configuration results in unmirrored kernel memory.");
 
+		mem_reliable_init(has_unmirrored_mem, zone_movable_pfn);
+
 		goto out2;
 	}
 
@@ -7226,9 +7249,28 @@ static int __init cmdline_parse_kernelcore(char *p)
 {
 	/* parse kernelcore=mirror */
 	if (parse_option_str(p, "mirror")) {
+		if (reliable_enabled) {
+			pr_info("kernelcore=reliable and kernelcore=mirror are mutually exclusive\n");
+			return -EINVAL;
+		}
+
 		mirrored_kernelcore = true;
 		return 0;
 	}
 
+#ifdef CONFIG_MEMORY_RELIABLE
+	/* parse kernelcore=reliable */
+	if (parse_option_str(p, "reliable")) {
+		if (!reliable_enabled && mirrored_kernelcore) {
+			pr_info("kernelcore=mirror and kernelcore=reliable are mutually exclusive\n");
+			return -EINVAL;
+		}
+
+		reliable_enabled = true;
+		mirrored_kernelcore = true;
+		return 0;
+	}
+#endif
+
 	return cmdline_parse_core(p, &required_kernelcore,
 				  &required_kernelcore_percent);
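
For illustration only, not part of the patch: a minimal sketch of how a caller could use the new flag once this series is applied. The ___GFP_RELIABILITY flag, the gfp_zone() redirection and the skip_none_movable_zone() check are taken from the hunks above; the helper names alloc_reliable_user_page() and alloc_normal_user_page() and their calling context are hypothetical. The sketch assumes CONFIG_MEMORY_RELIABLE=y on a machine booted with mirrored memory and "kernelcore=reliable", so the mem_reliable static key is enabled.

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/* Hypothetical helper, for illustration only. */
	static struct page *alloc_reliable_user_page(void)
	{
		/*
		 * GFP_HIGHUSER_MOVABLE normally maps to ZONE_MOVABLE, i.e. the
		 * non-mirrored region. With ___GFP_RELIABILITY set, gfp_zone()
		 * returns ZONE_NORMAL instead, so the page comes from mirrored
		 * (reliable) memory.
		 */
		return alloc_page(GFP_HIGHUSER_MOVABLE | ___GFP_RELIABILITY);
	}

	/*
	 * Hypothetical helper: without the flag, get_page_from_freelist() uses
	 * skip_none_movable_zone() to keep a movable user-style allocation made
	 * from task context out of the non-movable (mirrored) zones.
	 */
	static struct page *alloc_normal_user_page(void)
	{
		return alloc_page(GFP_HIGHUSER_MOVABLE);
	}

Note the design choice visible in the hunks: because ___GFP_RELIABILITY sits outside __GFP_BITS_SHIFT, gfp_allowed_mask strips it, and prepare_before_alloc() re-applies it only when memory reliable is enabled, so the flag is a no-op on kernels or boots without the feature.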