diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index a84bef7aa8640f88ce86f3b02a4aa2a03ff198ed..b508acfdde2e27bccda57d147934baca3ed51d8d 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -65,6 +65,7 @@ Currently, these files are in /proc/sys/vm:
 - page-cluster
 - panic_on_oom
 - percpu_pagelist_fraction
+- percpu_max_batchsize
 - stat_interval
 - stat_refresh
 - numa_stat
@@ -856,6 +857,15 @@ the high water marks for each per cpu page list.
 If the user writes '0' to this sysctl, it will revert to this default behavior.
 
 
+percpu_max_batchsize
+====================
+
+This is used to set the maximum batch and high sizes, in pages, of the per-cpu page lists in each zone.
+The default value is (256 * 1024) / PAGE_SIZE.
+The maximum value is limited to (512 * 1024) / PAGE_SIZE.
+The minimum value is limited to (64 * 1024) / PAGE_SIZE.
+
+
 stat_interval
 =============
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7f25539d2fe41466282afd24dec36bfb3398923e..0a70b4bdd23650f8460b8ab88493ed4723256033 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1009,6 +1009,8 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
 		size_t *, loff_t *);
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
+int percpu_max_batchsize_sysctl_handler(struct ctl_table *, int,
+		void *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
@@ -1016,6 +1018,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 int numa_zonelist_order_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 extern int percpu_pagelist_fraction;
+extern int percpu_max_batchsize;
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN	16
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d290ba2d8fee8b01ad66fdc3767e3d06bf6e2473..0a25c33e80a72c744d8250984bf2cc127cf7fb99 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2993,6 +2993,14 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= percpu_pagelist_fraction_sysctl_handler,
 		.extra1		= SYSCTL_ZERO,
 	},
+	{
+		.procname	= "percpu_max_batchsize",
+		.data		= &percpu_max_batchsize,
+		.maxlen		= sizeof(percpu_max_batchsize),
+		.mode		= 0644,
+		.proc_handler	= percpu_max_batchsize_sysctl_handler,
+		.extra1		= SYSCTL_ZERO,
+	},
 	{
 		.procname	= "page_lock_unfairness",
 		.data		= &sysctl_page_lock_unfairness,
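A minimal userspace sketch of how the new sysctl is exercised once the
series is applied (illustration only, not part of the patch; the path
follows from the ctl_table entry above, the -EINVAL rejection from the
handler added in mm/page_alloc.c below; writing requires CAP_SYS_ADMIN):

    /* hypothetical test program, not kernel code */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define SYSCTL_PATH "/proc/sys/vm/percpu_max_batchsize"

    int main(void)
    {
            FILE *f = fopen(SYSCTL_PATH, "r+");
            int cur;

            if (!f || fscanf(f, "%d", &cur) != 1) {
                    perror(SYSCTL_PATH);
                    return 1;
            }
            printf("current cap: %d pages\n", cur);

            /*
             * Doubling the default lands exactly on the allowed maximum;
             * anything above it is rejected by the handler with -EINVAL.
             */
            rewind(f);
            if (fprintf(f, "%d\n", cur * 2) < 0 || fclose(f) == EOF) {
                    fprintf(stderr, "write failed: %s\n", strerror(errno));
                    return 1;
            }
            return 0;
    }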
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 274b68a147eaf9f7a6fb261aa656df619943badd..d58ddd6e7f737274a622750da9ce48621b502f51 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,8 @@ typedef int __bitwise fpi_t;
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
+#define MAX_PERCPU_MAX_BATCHSIZE	((512 * 1024) / PAGE_SIZE)
+#define MIN_PERCPU_MAX_BATCHSIZE	(MAX_PERCPU_MAX_BATCHSIZE / 8)
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
@@ -167,6 +169,8 @@ unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
 
 int percpu_pagelist_fraction;
+int percpu_max_batchsize = MAX_PERCPU_MAX_BATCHSIZE / 2;
+
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 #ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
 DEFINE_STATIC_KEY_TRUE(init_on_alloc);
@@ -6757,10 +6761,9 @@ static int zone_batchsize(struct zone *zone)
 	 * size of the zone.
 	 */
 	batch = zone_managed_pages(zone) / 1024;
-	/* But no more than a meg. */
-	if (batch * PAGE_SIZE > 1024 * 1024)
-		batch = (1024 * 1024) / PAGE_SIZE;
 	batch /= 4;		/* We effectively *= 4 below */
+	if (batch > percpu_max_batchsize)
+		batch = percpu_max_batchsize;
 	if (batch < 1)
 		batch = 1;
 
@@ -8615,6 +8618,39 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
+int percpu_max_batchsize_sysctl_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int old_percpu_max_batchsize;
+	int ret;
+
+	mutex_lock(&pcp_batch_high_lock);
+	old_percpu_max_batchsize = percpu_max_batchsize;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (!write || ret < 0)
+		goto out;
+
+	/* Sanity checking to avoid pcp imbalance */
+	if (percpu_max_batchsize > MAX_PERCPU_MAX_BATCHSIZE ||
+	    percpu_max_batchsize < MIN_PERCPU_MAX_BATCHSIZE) {
+		percpu_max_batchsize = old_percpu_max_batchsize;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* No change? */
+	if (percpu_max_batchsize == old_percpu_max_batchsize)
+		goto out;
+
+	for_each_populated_zone(zone)
+		zone_set_pageset_high_and_batch(zone);
+out:
+	mutex_unlock(&pcp_batch_high_lock);
+	return ret;
+}
+
 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
 /*
  * Returns the number of pages that arch has reserved but
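To make the arithmetic above concrete, here is a standalone model of the
new clamp (illustration only; the 4 KiB page size and the zone size are
assumptions, and the power-of-two rounding that zone_batchsize() performs
afterwards is untouched by this patch and therefore omitted):

    /* standalone model, not kernel code */
    #include <stdio.h>

    #define PAGE_SIZE			4096UL
    #define MAX_PERCPU_MAX_BATCHSIZE	((512 * 1024) / PAGE_SIZE)	/* 128 pages */

    static int percpu_max_batchsize = MAX_PERCPU_MAX_BATCHSIZE / 2;	/* 64 pages */

    static unsigned long model_batch(unsigned long managed_pages)
    {
            unsigned long batch = managed_pages / 1024;	/* ~0.1% of the zone */

            batch /= 4;		/* effectively *= 4 later in the real code */
            if (batch > percpu_max_batchsize)
                    batch = percpu_max_batchsize;	/* the new tunable cap */
            if (batch < 1)
                    batch = 1;
            return batch;
    }

    int main(void)
    {
            /* a 4 GiB zone: 1048576 pages, so the uncapped batch would be 256 */
            printf("batch = %lu pages\n", model_batch(1048576UL));
            /* the default cap of 64 pages (256 KiB) preserves the old behaviour */
            return 0;
    }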
diff --git a/mm/zswap.c b/mm/zswap.c
index 358f48b173dc05f76b9f83f6b6e5e4fad6614282..d8e8d0084a22e87ff3b6927f5a78655d64b37811 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -79,6 +79,8 @@ static bool zswap_pool_reached_full;
 
 #define ZSWAP_PARAM_UNSET ""
 
+static int zswap_setup(void);
+
 /* Enable/disable zswap */
 static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
 static int zswap_enabled_param_set(const char *,
@@ -203,11 +205,14 @@ static DEFINE_SPINLOCK(zswap_pools_lock);
 /* pool counter to provide unique names to zpool */
 static atomic_t zswap_pools_count = ATOMIC_INIT(0);
 
-/* used by param callback function */
-static bool zswap_init_started;
+#define ZSWAP_UNINIT		0
+#define ZSWAP_INIT_SUCCEED	1
+#define ZSWAP_INIT_FAILED	2
 
-/* fatal error during init */
-static bool zswap_init_failed;
+/* init state */
+static int zswap_init_state;
+/* serializes initialization between the initcall and param-set paths */
+static DEFINE_MUTEX(zswap_init_lock);
 
 /* init completed, but couldn't create the initial pool */
 static bool zswap_has_pool;
@@ -261,13 +266,13 @@ static void zswap_update_total_size(void)
 **********************************/
 static struct kmem_cache *zswap_entry_cache;
 
-static int __init zswap_entry_cache_create(void)
+static int zswap_entry_cache_create(void)
 {
 	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
 	return zswap_entry_cache == NULL;
 }
 
-static void __init zswap_entry_cache_destroy(void)
+static void zswap_entry_cache_destroy(void)
 {
 	kmem_cache_destroy(zswap_entry_cache);
 }
@@ -648,7 +653,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	return NULL;
 }
 
-static __init struct zswap_pool *__zswap_pool_create_fallback(void)
+static struct zswap_pool *__zswap_pool_create_fallback(void)
 {
 	bool has_comp, has_zpool;
 
@@ -757,7 +762,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 	char *s = strstrip((char *)val);
 	int ret;
 
-	if (zswap_init_failed) {
+	if (zswap_init_state == ZSWAP_INIT_FAILED) {
 		pr_err("can't set param, initialization failed\n");
 		return -ENODEV;
 	}
@@ -766,11 +771,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
 		return 0;
 
-	/* if this is load-time (pre-init) param setting,
+	/*
+	 * if zswap has not been initialized,
 	 * don't create a pool; that's done during init.
 	 */
-	if (!zswap_init_started)
-		return param_set_charp(s, kp);
+	mutex_lock(&zswap_init_lock);
+	if (zswap_init_state == ZSWAP_UNINIT) {
+		ret = param_set_charp(s, kp);
+		mutex_unlock(&zswap_init_lock);
+		return ret;
+	}
+	mutex_unlock(&zswap_init_lock);
 
 	if (!type) {
 		if (!zpool_has_pool(s)) {
@@ -860,11 +871,19 @@ static int zswap_zpool_param_set(const char *val,
 static int zswap_enabled_param_set(const char *val,
 				   const struct kernel_param *kp)
 {
-	if (zswap_init_failed) {
+	if (system_state == SYSTEM_RUNNING) {
+		mutex_lock(&zswap_init_lock);
+		if (zswap_setup()) {
+			mutex_unlock(&zswap_init_lock);
+			return -ENODEV;
+		}
+		mutex_unlock(&zswap_init_lock);
+	}
+	if (zswap_init_state == ZSWAP_INIT_FAILED) {
 		pr_err("can't enable, initialization failed\n");
 		return -ENODEV;
 	}
-	if (!zswap_has_pool && zswap_init_started) {
+	if (!zswap_has_pool && zswap_init_state == ZSWAP_INIT_SUCCEED) {
 		pr_err("can't enable, no pool configured\n");
 		return -ENODEV;
 	}
@@ -1390,7 +1409,7 @@ static struct frontswap_ops zswap_frontswap_ops = {
 
 static struct dentry *zswap_debugfs_root;
 
-static int __init zswap_debugfs_init(void)
+static int zswap_debugfs_init(void)
 {
 	if (!debugfs_initialized())
 		return -ENODEV;
@@ -1426,7 +1445,7 @@ static void __exit zswap_debugfs_exit(void)
 	debugfs_remove_recursive(zswap_debugfs_root);
 }
 #else
-static int __init zswap_debugfs_init(void)
+static int zswap_debugfs_init(void)
 {
 	return 0;
 }
@@ -1434,15 +1453,13 @@ static int __init zswap_debugfs_init(void)
 static void __exit zswap_debugfs_exit(void) { }
 #endif
 
-/*********************************
-* module init and exit
-**********************************/
-static int __init init_zswap(void)
+static int zswap_setup(void)
 {
 	struct zswap_pool *pool;
 	int ret;
 
-	zswap_init_started = true;
+	if (zswap_init_state != ZSWAP_UNINIT)
+		return 0;
 
 	if (zswap_entry_cache_create()) {
 		pr_err("entry cache creation failed\n");
@@ -1481,6 +1498,7 @@ static int __init init_zswap(void)
 	frontswap_register_ops(&zswap_frontswap_ops);
 	if (zswap_debugfs_init())
 		pr_warn("debugfs initialization failed\n");
+	zswap_init_state = ZSWAP_INIT_SUCCEED;
 	return 0;
 
 fallback_fail:
@@ -1492,10 +1510,22 @@ static int __init init_zswap(void)
 	zswap_entry_cache_destroy();
cache_fail:
 	/* if built-in, we aren't unloaded on failure; don't allow use */
-	zswap_init_failed = true;
+	zswap_init_state = ZSWAP_INIT_FAILED;
 	zswap_enabled = false;
 	return -ENOMEM;
 }
+
+/*********************************
+* module init and exit
+**********************************/
+static int __init init_zswap(void)
+{
+	/* skip init if zswap is disabled at system startup */
+	if (!zswap_enabled)
+		return 0;
+	return zswap_setup();
+}
+
 /* must be late so crypto has time to come up */
 late_initcall(init_zswap);
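The zswap rework above comes down to one idea: zswap_setup() can now be
reached both from the late initcall and from a runtime write to the
"enabled" parameter, so it must be serialized and idempotent. A standalone
model of that state machine follows (illustration only; the pthread mutex
and the simplified functions are stand-ins for the kernel primitives):

    /* standalone model, not kernel code; build with -pthread */
    #include <pthread.h>
    #include <stdio.h>

    enum { ZSWAP_UNINIT, ZSWAP_INIT_SUCCEED, ZSWAP_INIT_FAILED };

    static int init_state = ZSWAP_UNINIT;
    static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;

    /* stands in for zswap_setup(); must do real work at most once */
    static int setup(void)
    {
            if (init_state != ZSWAP_UNINIT)
                    return 0;	/* already attempted; nothing to redo */

            puts("performing one-time setup");
            init_state = ZSWAP_INIT_SUCCEED;	/* or _FAILED on error */
            return 0;
    }

    /* stands in for zswap_enabled_param_set() called at runtime */
    static int enable(void)
    {
            int ret;

            pthread_mutex_lock(&init_lock);
            ret = setup();	/* serialized against concurrent writers */
            pthread_mutex_unlock(&init_lock);

            if (ret || init_state == ZSWAP_INIT_FAILED)
                    return -1;
            return 0;
    }

    int main(void)
    {
            enable();
            enable();	/* second call is a no-op, as in the patch */
            return 0;
    }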