diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index b392e496f816ac6c77fccd9c0c367de4efa57034..25cc89ba811bb93784fad06669131d71f62e80c4 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -58,6 +58,13 @@ sleep_millisecs - how many milliseconds ksmd should sleep before next scan e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" Default: 20 (chosen for demonstration purposes) +merge_across_nodes - specifies if pages from different numa nodes can be merged. + When set to 0, ksm merges only pages which physically + reside in the memory area of same NUMA node. It brings + lower latency to access to shared page. Value can be + changed only when there is no ksm shared pages in system. + Default: 1 + run - set 0 to stop ksmd from running but keep merged pages, set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", set 2 to stop ksmd and unmerge all pages currently merged, diff --git a/mm/ksm.c b/mm/ksm.c index d3842b206f8af00a7db5d5412d2984a0025e52d4..1602cc9e3d73665f7d49ede7c16cb7ce92d4bdc6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "internal.h" @@ -139,6 +140,9 @@ struct rmap_item { struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ +#ifdef CONFIG_NUMA + unsigned int nid; +#endif union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ @@ -153,8 +157,8 @@ struct rmap_item { #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ -static struct rb_root root_stable_tree = RB_ROOT; -static struct rb_root root_unstable_tree = RB_ROOT; +static struct rb_root root_unstable_tree[MAX_NUMNODES]; +static struct rb_root root_stable_tree[MAX_NUMNODES]; #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -188,6 +192,9 @@ static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; +/* Zeroed when merging across nodes is not allowed */ +static unsigned int ksm_merge_across_nodes = 1; + #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 @@ -441,10 +448,25 @@ out: page = NULL; return page; } +/* + * This helper is used for getting right index into array of tree roots. + * When merge_across_nodes knob is set to 1, there are only two rb-trees for + * stable and unstable pages from all nodes with roots in index 0. Otherwise, + * every node has its own stable and unstable tree. + */ +static inline int get_kpfn_nid(unsigned long kpfn) +{ + if (ksm_merge_across_nodes) + return 0; + else + return pfn_to_nid(kpfn); +} + static void remove_node_from_stable_tree(struct stable_node *stable_node) { struct rmap_item *rmap_item; struct hlist_node *hlist; + int nid; hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) @@ -456,7 +478,9 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) cond_resched(); } - rb_erase(&stable_node->node, &root_stable_tree); + nid = get_kpfn_nid(stable_node->kpfn); + + rb_erase(&stable_node->node, &root_stable_tree[nid]); free_stable_node(stable_node); } @@ -554,7 +578,12 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) - rb_erase(&rmap_item->node, &root_unstable_tree); +#ifdef CONFIG_NUMA + rb_erase(&rmap_item->node, + &root_unstable_tree[rmap_item->nid]); +#else + rb_erase(&rmap_item->node, &root_unstable_tree[0]); +#endif ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; @@ -990,8 +1019,9 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, */ static struct page *stable_tree_search(struct page *page) { - struct rb_node *node = root_stable_tree.rb_node; + struct rb_node *node; struct stable_node *stable_node; + int nid; stable_node = page_stable_node(page); if (stable_node) { /* ksm page forked */ @@ -999,6 +1029,9 @@ static struct page *stable_tree_search(struct page *page) return page; } + nid = get_kpfn_nid(page_to_pfn(page)); + node = root_stable_tree[nid].rb_node; + while (node) { struct page *tree_page; int ret; @@ -1033,10 +1066,16 @@ static struct page *stable_tree_search(struct page *page) */ static struct stable_node *stable_tree_insert(struct page *kpage) { - struct rb_node **new = &root_stable_tree.rb_node; + int nid; + unsigned long kpfn; + struct rb_node **new; struct rb_node *parent = NULL; struct stable_node *stable_node; + kpfn = page_to_pfn(kpage); + nid = get_kpfn_nid(kpfn); + new = &root_stable_tree[nid].rb_node; + while (*new) { struct page *tree_page; int ret; @@ -1070,11 +1109,11 @@ static struct stable_node *stable_tree_insert(struct page *kpage) return NULL; rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, &root_stable_tree); + rb_insert_color(&stable_node->node, &root_stable_tree[nid]); INIT_HLIST_HEAD(&stable_node->hlist); - stable_node->kpfn = page_to_pfn(kpage); + stable_node->kpfn = kpfn; set_page_stable_node(kpage, stable_node); return stable_node; @@ -1098,10 +1137,15 @@ static struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, struct page *page, struct page **tree_pagep) - { - struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node **new; + struct rb_root *root; struct rb_node *parent = NULL; + int nid; + + nid = get_kpfn_nid(page_to_pfn(page)); + root = &root_unstable_tree[nid]; + new = &root->rb_node; while (*new) { struct rmap_item *tree_rmap_item; @@ -1122,6 +1166,18 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, return NULL; } + /* + * If tree_page has been migrated to another NUMA node, it + * will be flushed out and put into the right unstable tree + * next time: only merge with it if merge_across_nodes. + * Just notice, we don't have similar problem for PageKsm + * because their migration is disabled now. (62b61f611e) + */ + if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { + put_page(tree_page); + return NULL; + } + ret = memcmp_pages(page, tree_page); parent = *new; @@ -1139,8 +1195,11 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); +#ifdef CONFIG_NUMA + rmap_item->nid = nid; +#endif rb_link_node(&rmap_item->node, parent, new); - rb_insert_color(&rmap_item->node, &root_unstable_tree); + rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; @@ -1154,6 +1213,13 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, static void stable_tree_append(struct rmap_item *rmap_item, struct stable_node *stable_node) { +#ifdef CONFIG_NUMA + /* + * Usually rmap_item->nid is already set correctly, + * but it may be wrong after switching merge_across_nodes. + */ + rmap_item->nid = get_kpfn_nid(stable_node->kpfn); +#endif rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); @@ -1283,6 +1349,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) struct mm_slot *slot; struct vm_area_struct *vma; struct rmap_item *rmap_item; + int nid; if (list_empty(&ksm_mm_head.mm_list)) return NULL; @@ -1301,7 +1368,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) */ lru_add_drain_all(); - root_unstable_tree = RB_ROOT; + for (nid = 0; nid < nr_node_ids; nid++) + root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); @@ -1770,15 +1838,19 @@ static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct rb_node *node; + int nid; - for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { - struct stable_node *stable_node; + for (nid = 0; nid < nr_node_ids; nid++) + for (node = rb_first(&root_stable_tree[nid]); node; + node = rb_next(node)) { + struct stable_node *stable_node; + + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) + return stable_node; + } - stable_node = rb_entry(node, struct stable_node, node); - if (stable_node->kpfn >= start_pfn && - stable_node->kpfn < end_pfn) - return stable_node; - } return NULL; } @@ -1925,6 +1997,40 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); +#ifdef CONFIG_NUMA +static ssize_t merge_across_nodes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_merge_across_nodes); +} + +static ssize_t merge_across_nodes_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long knob; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + mutex_lock(&ksm_thread_mutex); + if (ksm_merge_across_nodes != knob) { + if (ksm_pages_shared) + err = -EBUSY; + else + ksm_merge_across_nodes = knob; + } + mutex_unlock(&ksm_thread_mutex); + + return err ? err : count; +} +KSM_ATTR(merge_across_nodes); +#endif + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1979,6 +2085,9 @@ static struct attribute *ksm_attrs[] = { &pages_unshared_attr.attr, &pages_volatile_attr.attr, &full_scans_attr.attr, +#ifdef CONFIG_NUMA + &merge_across_nodes_attr.attr, +#endif NULL, }; @@ -1992,11 +2101,15 @@ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; + int nid; err = ksm_slab_init(); if (err) goto out; + for (nid = 0; nid < nr_node_ids; nid++) + root_stable_tree[nid] = RB_ROOT; + ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { printk(KERN_ERR "ksm: creating kthread failed\n");