diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 0a5d5fb18854200ebfb0704473293dc28ba6925e..99f89aa101697d553f60602022694c945de9dd80 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package. Manual migration is useful if for example the scheduler has relocated a process to a processor on a distant node. A batch scheduler or an administrator may detect the situation and move the pages of the process -nearer to the new processor. At some point in the future we may have -some mechanism in the scheduler that will automatically move the pages. +nearer to the new processor. The kernel itself does only provide +manual page migration support. Automatic page migration may be implemented +through user space processes that move pages. A special function call +"move_pages" allows the moving of individual pages within a process. +A NUMA profiler may f.e. obtain a log showing frequent off node +accesses and may use the result to move pages to more advantageous +locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to @@ -62,22 +67,14 @@ A. In kernel use of migrate_pages() It also prevents the swapper or other scans to encounter the page. -2. Generate a list of newly allocates pages. These pages will contain the - contents of the pages from the first list after page migration is - complete. +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. 3. The migrate_pages() function is called which attempts - to do the migration. It returns the moved pages in the - list specified as the third parameter and the failed - migrations in the fourth parameter. When the function - returns the first list will contain the pages that could still be retried. - -4. The leftover pages of various types are returned - to the LRU using putback_to_lru_pages() or otherwise - disposed of. The pages will still have the refcount as - increased by isolate_lru_pages() if putback_to_lru_pages() is not - used! The kernel may want to handle the various cases of failures in - different ways. + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. B. How migrate_pages() works ---------------------------- diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index bcb80ca5cf40ef64397166dd2296d38f9af05bda..32c999f58d129486d638299a0dce3d06715f4538 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1584,7 +1584,7 @@ sys_call_table: data8 sys_keyctl data8 sys_ioprio_set data8 sys_ioprio_get // 1275 - data8 sys_ni_syscall + data8 sys_move_pages data8 sys_inotify_init data8 sys_inotify_add_watch data8 sys_inotify_rm_watch diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index 632f2eedf72c9fab463c74ce682485a576e7c077..bb0eb727dcd086fb15d43135ca7e0e20ebb4cdf8 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -265,7 +265,7 @@ #define __NR_keyctl 1273 #define __NR_ioprio_set 1274 #define __NR_ioprio_get 1275 -/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */ +#define __NR_move_pages 1276 #define __NR_inotify_init 1277 #define __NR_inotify_add_watch 1278 #define __NR_inotify_rm_watch 1279 diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 5b95d6568dc4ad83f5103d20462c632bc4d9b77f..5dba23a1c0d0e0d75dc0bb465aaba2836bca5bcc 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -3,7 +3,7 @@ #include -typedef struct page *new_page_t(struct page *, unsigned long private); +typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index bd67a4413df7b29389f3729073becd4862440605..7e3f23490918cbd38df83a3bbed209fefeaa9c7b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *from, const unsigned long __user *to); +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, + int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f195ea8ec71f83fd34c775346c1c68..597229749dec68e1ea6926cf44a6db250271ec1a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f432642e9e6646b304685226a0dc6e634b5dc35e..05b84acf0bb33ad20ccf9914c23d883a8c42d01c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } -static struct page *new_node_page(struct page *page, unsigned long node) +static struct page *new_node_page(struct page *page, unsigned long node, int **x) { return alloc_pages_node(node, GFP_HIGHUSER, 0); } @@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm, } -static struct page *new_vma_page(struct page *page, unsigned long private) +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) { struct vm_area_struct *vma = (struct vm_area_struct *)private; diff --git a/mm/migrate.c b/mm/migrate.c index 251a8d158257a39fd87c1aa26f6f13f758048715..033a12f4c9499657aa07b10860484757291cc0d0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) } /* - * migrate_prep() needs to be called after we have compiled the list of pages - * to be migrated using isolate_lru_page() but before we begin a series of calls - * to migrate_pages(). + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). */ int migrate_prep(void) { @@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *page, int force) { int rc = 0; - struct page *newpage = get_new_page(page, private); + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -642,6 +644,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * then this will free the page. */ move_to_lru(newpage); + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(newpage); + } return rc; } @@ -710,3 +718,255 @@ int migrate_pages(struct list_head *from, return nr_failed + retry; } +#ifdef CONFIG_NUMA +/* + * Move a list of individual pages + */ +struct page_to_node { + unsigned long addr; + struct page *page; + int node; + int status; +}; + +static struct page *new_page_node(struct page *p, unsigned long private, + int **result) +{ + struct page_to_node *pm = (struct page_to_node *)private; + + while (pm->node != MAX_NUMNODES && pm->page != p) + pm++; + + if (pm->node == MAX_NUMNODES) + return NULL; + + *result = &pm->status; + + return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); +} + +/* + * Move a set of pages as indicated in the pm array. The addr + * field must be set to the virtual address of the page to be moved + * and the node number must contain a valid target node. + */ +static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, + int migrate_all) +{ + int err; + struct page_to_node *pp; + LIST_HEAD(pagelist); + + down_read(&mm->mmap_sem); + + /* + * Build a list of pages to migrate + */ + migrate_prep(); + for (pp = pm; pp->node != MAX_NUMNODES; pp++) { + struct vm_area_struct *vma; + struct page *page; + + /* + * A valid page pointer that will not match any of the + * pages that will be moved. + */ + pp->page = ZERO_PAGE(0); + + err = -EFAULT; + vma = find_vma(mm, pp->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pp->addr, FOLL_GET); + err = -ENOENT; + if (!page) + goto set_status; + + if (PageReserved(page)) /* Check for zero page */ + goto put_and_set; + + pp->page = page; + err = page_to_nid(page); + + if (err == pp->node) + /* + * Node already in the right place + */ + goto put_and_set; + + err = -EACCES; + if (page_mapcount(page) > 1 && + !migrate_all) + goto put_and_set; + + err = isolate_lru_page(page, &pagelist); +put_and_set: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +set_status: + pp->status = err; + } + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_page_node, + (unsigned long)pm); + else + err = -ENOENT; + + up_read(&mm->mmap_sem); + return err; +} + +/* + * Determine the nodes of a list of pages. The addr in the pm array + * must have been set to the virtual address of which we want to determine + * the node number. + */ +static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +{ + down_read(&mm->mmap_sem); + + for ( ; pm->node != MAX_NUMNODES; pm++) { + struct vm_area_struct *vma; + struct page *page; + int err; + + err = -EFAULT; + vma = find_vma(mm, pm->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pm->addr, 0); + err = -ENOENT; + /* Use PageReserved to check for zero page */ + if (!page || PageReserved(page)) + goto set_status; + + err = page_to_nid(page); +set_status: + pm->status = err; + } + + up_read(&mm->mmap_sem); + return 0; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + int err = 0; + int i; + struct task_struct *task; + nodemask_t task_nodes; + struct mm_struct *mm; + struct page_to_node *pm = NULL; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + return -EINVAL; + + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out2; + } + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out2; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out2; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out; + + err = -ENODEV; + if (!node_online(node)) + goto out; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out; + + pm[i].node = node; + } + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + if (nodes) + err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); + else + err = do_pages_stat(mm, pm); + + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out: + vfree(pm); +out2: + mmput(mm); + return err; +} +#endif +