Commit 52469b4f authored by Linus Torvalds

Merge branch 'core-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull NUMA balancing memory corruption fixes from Ingo Molnar:
 "So these fixes are definitely not something I'd like to sit on, but as
  I said to Mel at the KS the timing is quite tight, with Linus planning
  v3.12-final within a week.

  Fedora-19 is affected:

   comet:~> grep NUMA_BALANCING /boot/config-3.11.3-201.fc19.x86_64

   CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y
   CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y
   CONFIG_NUMA_BALANCING=y

  AFAICS Ubuntu will be affected as well, once it updates the kernel:

   hubble:~> grep NUMA_BALANCING /boot/config-3.8.0-32-generic

   CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y
   CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y
   CONFIG_NUMA_BALANCING=y

  These 6 commits are a minimalized set of cherry-picks needed to fix
  the memory corruption bugs.  All commits are fixes, except "mm: numa:
  Sanitize task_numa_fault() callsites" which is a cleanup that made two
  followup fixes simpler.

  I've done targeted testing with just this SHA1 to try to make sure
  there are no cherry-picking artifacts.  The original non-cherry-picked
  set of fixes were exposed to linux-next for a couple of weeks"

* 'core-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  mm: Account for a THP NUMA hinting update as one PTE update
  mm: Close races between THP migration and PMD numa clearing
  mm: numa: Sanitize task_numa_fault() callsites
  mm: Prevent parallel splits during THP migration
  mm: Wait for THP migrations to complete during NUMA hinting faults
  mm: numa: Do not account for a hinting fault if we raced
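
For reference: kernels built with CONFIG_NUMA_BALANCING have also accepted the
numa_balancing=disable boot parameter since the feature was merged, so an
affected machine can run with automatic NUMA balancing switched off until a
fixed kernel is installed.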
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1278,64 +1278,90 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
+        struct anon_vma *anon_vma = NULL;
         struct page *page;
         unsigned long haddr = addr & HPAGE_PMD_MASK;
+        int page_nid = -1, this_nid = numa_node_id();
         int target_nid;
-        int current_nid = -1;
-        bool migrated;
+        bool page_locked;
+        bool migrated = false;
 
         spin_lock(&mm->page_table_lock);
         if (unlikely(!pmd_same(pmd, *pmdp)))
                 goto out_unlock;
 
         page = pmd_page(pmd);
-        get_page(page);
-        current_nid = page_to_nid(page);
+        page_nid = page_to_nid(page);
         count_vm_numa_event(NUMA_HINT_FAULTS);
-        if (current_nid == numa_node_id())
+        if (page_nid == this_nid)
                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
+        /*
+         * Acquire the page lock to serialise THP migrations but avoid dropping
+         * page_table_lock if at all possible
+         */
+        page_locked = trylock_page(page);
         target_nid = mpol_misplaced(page, vma, haddr);
         if (target_nid == -1) {
-                put_page(page);
-                goto clear_pmdnuma;
+                /* If the page was locked, there are no parallel migrations */
+                if (page_locked)
+                        goto clear_pmdnuma;
+
+                /*
+                 * Otherwise wait for potential migrations and retry. We do
+                 * relock and check_same as the page may no longer be mapped.
+                 * As the fault is being retried, do not account for it.
+                 */
+                spin_unlock(&mm->page_table_lock);
+                wait_on_page_locked(page);
+                page_nid = -1;
+                goto out;
         }
 
-        /* Acquire the page lock to serialise THP migrations */
+        /* Page is misplaced, serialise migrations and parallel THP splits */
+        get_page(page);
         spin_unlock(&mm->page_table_lock);
-        lock_page(page);
+        if (!page_locked)
+                lock_page(page);
+        anon_vma = page_lock_anon_vma_read(page);
 
         /* Confirm the PTE did not change while page_table_lock was released */
         spin_lock(&mm->page_table_lock);
         if (unlikely(!pmd_same(pmd, *pmdp))) {
                 unlock_page(page);
                 put_page(page);
+                page_nid = -1;
                 goto out_unlock;
         }
-        spin_unlock(&mm->page_table_lock);
 
-        /* Migrate the THP to the requested node */
+        /*
+         * Migrate the THP to the requested node, returns with page unlocked
+         * and pmd_numa cleared.
+         */
+        spin_unlock(&mm->page_table_lock);
         migrated = migrate_misplaced_transhuge_page(mm, vma,
                                 pmdp, pmd, addr, page, target_nid);
-        if (!migrated)
-                goto check_same;
+        if (migrated)
+                page_nid = target_nid;
 
-        task_numa_fault(target_nid, HPAGE_PMD_NR, true);
-        return 0;
+        goto out;
 
-check_same:
-        spin_lock(&mm->page_table_lock);
-        if (unlikely(!pmd_same(pmd, *pmdp)))
-                goto out_unlock;
 clear_pmdnuma:
+        BUG_ON(!PageLocked(page));
         pmd = pmd_mknonnuma(pmd);
         set_pmd_at(mm, haddr, pmdp, pmd);
         VM_BUG_ON(pmd_numa(*pmdp));
         update_mmu_cache_pmd(vma, addr, pmdp);
+        unlock_page(page);
+
 out_unlock:
         spin_unlock(&mm->page_table_lock);
-        if (current_nid != -1)
-                task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+
+out:
+        if (anon_vma)
+                page_unlock_anon_vma_read(anon_vma);
+
+        if (page_nid != -1)
+                task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
+
         return 0;
 }
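
The reworked control flow above is dense, so a userspace analogue may help.
The sketch below illustrates only the locking idiom, not kernel code: every
name is hypothetical, a plain int stands in for the PMD, a mutex trylock
stands in for trylock_page(), and waiting on the mutex stands in for
wait_on_page_locked(). Build with: cc sketch.c -lpthread

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ page_table_lock */
    static pthread_mutex_t page_lock  = PTHREAD_MUTEX_INITIALIZER; /* ~ the page lock */
    static int pmd_val;                                            /* ~ *pmdp */

    static void hinting_fault(void)
    {
    retry:
            pthread_mutex_lock(&table_lock);
            int snapshot = pmd_val;            /* ~ the pmd value we faulted on */

            /* Try to serialise against migrations without dropping table_lock */
            if (pthread_mutex_trylock(&page_lock) != 0) {
                    /*
                     * A "migration" holds the page lock: drop table_lock, wait
                     * for the holder to finish, then retry the whole fault
                     * (and do not account for this attempt).
                     */
                    pthread_mutex_unlock(&table_lock);
                    pthread_mutex_lock(&page_lock);   /* ~ wait_on_page_locked() */
                    pthread_mutex_unlock(&page_lock);
                    goto retry;
            }

            /*
             * Drop table_lock for the expensive work (the kernel migrates the
             * page here), then retake it and confirm nothing changed meanwhile.
             */
            pthread_mutex_unlock(&table_lock);
            /* ... expensive work done under page_lock only ... */
            pthread_mutex_lock(&table_lock);
            if (snapshot != pmd_val) {
                    /* Someone changed the "pmd": back out and retry */
                    pthread_mutex_unlock(&page_lock);
                    pthread_mutex_unlock(&table_lock);
                    goto retry;
            }

            printf("fault handled with migrations excluded\n");
            pthread_mutex_unlock(&page_lock);
            pthread_mutex_unlock(&table_lock);
    }

    int main(void)
    {
            hinting_fault();
            return 0;
    }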
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3521,12 +3521,12 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-                unsigned long addr, int current_nid)
+                unsigned long addr, int page_nid)
 {
         get_page(page);
 
         count_vm_numa_event(NUMA_HINT_FAULTS);
-        if (current_nid == numa_node_id())
+        if (page_nid == numa_node_id())
                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
         return mpol_misplaced(page, vma, addr);
@@ -3537,7 +3537,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
         struct page *page = NULL;
         spinlock_t *ptl;
-        int current_nid = -1;
+        int page_nid = -1;
         int target_nid;
         bool migrated = false;
 
@@ -3567,15 +3567,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 return 0;
         }
 
-        current_nid = page_to_nid(page);
-        target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+        page_nid = page_to_nid(page);
+        target_nid = numa_migrate_prep(page, vma, addr, page_nid);
         pte_unmap_unlock(ptep, ptl);
         if (target_nid == -1) {
-                /*
-                 * Account for the fault against the current node if it not
-                 * being replaced regardless of where the page is located.
-                 */
-                current_nid = numa_node_id();
                 put_page(page);
                 goto out;
         }
@@ -3583,11 +3578,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         /* Migrate to the requested node */
         migrated = migrate_misplaced_page(page, target_nid);
         if (migrated)
-                current_nid = target_nid;
+                page_nid = target_nid;
 
 out:
-        if (current_nid != -1)
-                task_numa_fault(current_nid, 1, migrated);
+        if (page_nid != -1)
+                task_numa_fault(page_nid, 1, migrated);
         return 0;
 }
 
@@ -3602,7 +3597,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long offset;
         spinlock_t *ptl;
         bool numa = false;
-        int local_nid = numa_node_id();
 
         spin_lock(&mm->page_table_lock);
         pmd = *pmdp;
@@ -3625,9 +3619,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
                 pte_t pteval = *pte;
                 struct page *page;
-                int curr_nid = local_nid;
+                int page_nid = -1;
                 int target_nid;
-                bool migrated;
+                bool migrated = false;
+
                 if (!pte_present(pteval))
                         continue;
                 if (!pte_numa(pteval))
@@ -3649,25 +3644,19 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 if (unlikely(page_mapcount(page) != 1))
                         continue;
 
-                /*
-                 * Note that the NUMA fault is later accounted to either
-                 * the node that is currently running or where the page is
-                 * migrated to.
-                 */
-                curr_nid = local_nid;
-                target_nid = numa_migrate_prep(page, vma, addr,
-                                               page_to_nid(page));
-                if (target_nid == -1) {
+                page_nid = page_to_nid(page);
+                target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+                pte_unmap_unlock(pte, ptl);
+                if (target_nid != -1) {
+                        migrated = migrate_misplaced_page(page, target_nid);
+                        if (migrated)
+                                page_nid = target_nid;
+                } else {
                         put_page(page);
-                        continue;
                 }
 
-                /* Migrate to the requested node */
-                pte_unmap_unlock(pte, ptl);
-                migrated = migrate_misplaced_page(page, target_nid);
-                if (migrated)
-                        curr_nid = target_nid;
-                task_numa_fault(curr_nid, 1, migrated);
+                if (page_nid != -1)
+                        task_numa_fault(page_nid, 1, migrated);
 
                 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
         }
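
The mm/memory.c hunks are the "mm: numa: Sanitize task_numa_fault() callsites"
cleanup seen from the regular-page side: instead of juggling the local node in
a throwaway current_nid/curr_nid variable, each handler now tracks the node the
page actually resided on (page_nid), falls through to a single task_numa_fault()
call at the end, and only accounts the fault when page_nid is valid, so a fault
that resolved nothing is no longer charged to the local node.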
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1715,12 +1715,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                 unlock_page(new_page);
                 put_page(new_page);             /* Free it */
 
-                unlock_page(page);
+                /* Retake the callers reference and putback on LRU */
+                get_page(page);
                 putback_lru_page(page);
-
-                count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-                isolated = 0;
-                goto out;
+                mod_zone_page_state(page_zone(page),
+                         NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+                goto out_fail;
         }
 
         /*
@@ -1737,9 +1737,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
         entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
         entry = pmd_mkhuge(entry);
 
-        page_add_new_anon_rmap(new_page, vma, haddr);
-
+        pmdp_clear_flush(vma, haddr, pmd);
         set_pmd_at(mm, haddr, pmd, entry);
+        page_add_new_anon_rmap(new_page, vma, haddr);
         update_mmu_cache_pmd(vma, address, &entry);
         page_remove_rmap(page);
         /*
@@ -1758,7 +1758,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
         count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
         count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 
-out:
         mod_zone_page_state(page_zone(page),
                         NR_ISOLATED_ANON + page_lru,
                         -HPAGE_PMD_NR);
@@ -1767,6 +1766,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 out_fail:
         count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
+        entry = pmd_mknonnuma(entry);
+        set_pmd_at(mm, haddr, pmd, entry);
+        update_mmu_cache_pmd(vma, address, &entry);
+
         unlock_page(page);
         put_page(page);
         return 0;
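
Two details in the mm/migrate.c hunks carry the "mm: Close races between THP
migration and PMD numa clearing" fix: on success the old PMD is cleared and
flushed (pmdp_clear_flush) before the replacement PMD and its rmap are
installed, and the failure paths (out_fail/out_dropref) now rewrite the PMD as
non-NUMA themselves, which is why do_huge_pmd_numa_page above could drop its
check_same retry pass after a failed migration.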
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -148,7 +148,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                                 split_huge_page_pmd(vma, addr, pmd);
                         else if (change_huge_pmd(vma, pmd, addr, newprot,
                                                  prot_numa)) {
-                                pages += HPAGE_PMD_NR;
+                                pages++;
                                 continue;
                         }
                         /* fall through */
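
Finally, the mm/mprotect.c hunk is "mm: Account for a THP NUMA hinting update
as one PTE update": a successful change_huge_pmd() now bumps the counter by one
instead of HPAGE_PMD_NR, so a THP hinting update is not over-counted relative
to the single PTE updates it stands in for.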