diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dd74e46828c0fc243740b61a18c2dea654fafb5e..0596e8e0cc1992b1fc32a00277a4f879cd07a8f3 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_t pte = gup_get_pte(ptep); struct page *page; + /* Similar to the PMD case, NUMA hinting must take slow path */ + if (pte_numa(pte)) { + pte_unmap(ptep); + return 0; + } + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; @@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_numa(pmd)) + return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; } else { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33a5dc492810d59eae0c7069e60dd3bdd9c8a374..51f069303ab9e956f85187f42c141d629612f9b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1243,6 +1243,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) return ERR_PTR(-EFAULT); + /* Full NUMA hinting faults to serialise migration in fault paths */ + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto out; + page = pmd_page(*pmd); VM_BUG_ON(!PageHead(page)); if (flags & FOLL_TOUCH) { @@ -1323,23 +1327,27 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* If the page was locked, there are no parallel migrations */ if (page_locked) goto clear_pmdnuma; + } - /* - * Otherwise wait for potential migrations and retry. We do - * relock and check_same as the page may no longer be mapped. - * As the fault is being retried, do not account for it. - */ + /* + * If there are potential migrations, wait for completion and retry. We + * do not relock and check_same as the page may no longer be mapped. + * Furtermore, even if the page is currently misplaced, there is no + * guarantee it is still misplaced after the migration completes. + */ + if (!page_locked) { spin_unlock(ptl); wait_on_page_locked(page); page_nid = -1; goto out; } - /* Page is misplaced, serialise migrations and parallel THP splits */ + /* + * Page is misplaced. Page lock serialises migrations. Acquire anon_vma + * to serialises splits + */ get_page(page); spin_unlock(ptl); - if (!page_locked) - lock_page(page); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ diff --git a/mm/migrate.c b/mm/migrate.c index bb940045fe8595842ed58f2e32f87b83d40485e1..2cabbd5fa5bf4806e3f75b27119aff8cfc65f12b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1722,6 +1722,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct page *new_page = NULL; struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); + pmd_t orig_entry; /* * Rate-limit the amount of data that is being migrated to a node. @@ -1756,7 +1757,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Recheck the target PMD */ ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, entry))) { + if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { +fail_putback: spin_unlock(ptl); /* Reverse changes made by migrate_page_copy() */ @@ -1786,16 +1788,34 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, */ mem_cgroup_prepare_migration(page, new_page, &memcg); + orig_entry = *pmd; entry = mk_pmd(new_page, vma->vm_page_prot); - entry = pmd_mknonnuma(entry); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + /* + * Clear the old entry under pagetable lock and establish the new PTE. + * Any parallel GUP will either observe the old page blocking on the + * page lock, block on the page table lock or observe the new page. + * The SetPageUptodate on the new page and page_add_new_anon_rmap + * guarantee the copy is visible before the pagetable update. + */ + flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + page_add_new_anon_rmap(new_page, vma, haddr); pmdp_clear_flush(vma, haddr, pmd); set_pmd_at(mm, haddr, pmd, entry); - page_add_new_anon_rmap(new_page, vma, haddr); update_mmu_cache_pmd(vma, address, &entry); + + if (page_count(page) != 2) { + set_pmd_at(mm, haddr, pmd, orig_entry); + flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + update_mmu_cache_pmd(vma, address, &entry); + page_remove_rmap(new_page); + goto fail_putback; + } + page_remove_rmap(page); + /* * Finish the charge transaction under the page table lock to * prevent split_huge_page() from dividing up the charge @@ -1820,9 +1840,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: - entry = pmd_mknonnuma(entry); - set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, &entry); + ptl = pmd_lock(mm, pmd); + if (pmd_same(*pmd, entry)) { + entry = pmd_mknonnuma(entry); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + } + spin_unlock(ptl); unlock_page(page); put_page(page);