  1. 12 Sep 2022 (1 commit)
  2. 29 Aug 2022 (1 commit)
  3. 21 Aug 2022 (1 commit)
    • mm/hugetlb: support write-faults in shared mappings · 1d8d1464
      By David Hildenbrand
      If we ever get a write-fault on a write-protected page in a shared
      mapping, we'd be in trouble (again).  Instead, we can simply map the page
      writable.
      
      And in fact, there is even a way right now to trigger that code via
      uffd-wp ever since we started to support it for shmem in 5.19:
      
      --------------------------------------------------------------------------
       #include <stdio.h>
       #include <stdlib.h>
       #include <string.h>
       #include <fcntl.h>
       #include <unistd.h>
       #include <errno.h>
       #include <sys/mman.h>
       #include <sys/syscall.h>
       #include <sys/ioctl.h>
       #include <linux/userfaultfd.h>
      
       #define HUGETLB_SIZE (2 * 1024 * 1024u)
      
       static char *map;
       int uffd;
      
       static int temp_setup_uffd(void)
       {
       	struct uffdio_api uffdio_api;
       	struct uffdio_register uffdio_register;
       	struct uffdio_writeprotect uffd_writeprotect;
       	struct uffdio_range uffd_range;
      
       	uffd = syscall(__NR_userfaultfd,
       		       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
       	if (uffd < 0) {
       		fprintf(stderr, "syscall() failed: %d\n", errno);
       		return -errno;
       	}
      
       	uffdio_api.api = UFFD_API;
       	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
       	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
       		fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
       		return -errno;
       	}
      
       	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
       		fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
       		return -ENOSYS;
       	}
      
       	/* Register UFFD-WP */
       	uffdio_register.range.start = (unsigned long) map;
       	uffdio_register.range.len = HUGETLB_SIZE;
       	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
       	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
       		fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
       		return -errno;
       	}
      
       	/* Writeprotect a single page. */
       	uffd_writeprotect.range.start = (unsigned long) map;
       	uffd_writeprotect.range.len = HUGETLB_SIZE;
       	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
       	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
       		fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
       		return -errno;
       	}
      
       	/* Unregister UFFD-WP without prior writeunprotection. */
       	uffd_range.start = (unsigned long) map;
       	uffd_range.len = HUGETLB_SIZE;
       	if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
       		fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", errno);
       		return -errno;
       	}
      
       	return 0;
       }
      
       int main(int argc, char **argv)
       {
       	int fd;
      
       	fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT, 0600);
       	if (fd < 0) {
       		fprintf(stderr, "open() failed\n");
       		return -errno;
       	}
       	if (ftruncate(fd, HUGETLB_SIZE)) {
       		fprintf(stderr, "ftruncate() failed\n");
       		return -errno;
       	}
      
       	map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
       	if (map == MAP_FAILED) {
       		fprintf(stderr, "mmap() failed\n");
       		return -errno;
       	}
      
       	*map = 0;
      
       	if (temp_setup_uffd())
       		return 1;
      
       	*map = 0;
      
       	return 0;
       }
      --------------------------------------------------------------------------
      
      The above test fails with SIGBUS when there is only a single free
      hugetlb page:
       # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
       # ./test
       Bus error (core dumped)
      
      And worse, with sufficient free hugetlb pages it will map an anonymous page
      into a shared mapping, for example, messing up accounting during unmap
      and breaking MAP_SHARED semantics:
       # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
       # ./test
       # cat /proc/meminfo | grep HugePages_
       HugePages_Total:       2
       HugePages_Free:        1
       HugePages_Rsvd:    18446744073709551615
       HugePages_Surp:        0
      
      The reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
      unregistering, and consequently keeps the PTE write-protected; this is
      done to avoid additional overhead when unregistering.  Note that
      this is the case also for !hugetlb and that we will end up with writable
      PTEs that still have the uffd-wp PTE bit set once we return from
      hugetlb_wp().  I'm not touching the uffd-wp PTE bit for now, because it
      seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
      
      VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE indicates
      that MAP_SHARED handling was at least envisioned, but could never have
      worked as expected.
      
      While at it, make sure that we never end up in hugetlb_wp() on write
      faults without VM_WRITE, because we don't support maybe_mkwrite()
      semantics as commonly used in the !hugetlb case -- for example, in
      wp_page_reuse().
      
      Note that there is no need to do any kind of reservation in
      hugetlb_fault() in this case ...  because we already have a hugetlb page
      mapped R/O that we will simply map writable and we are not dealing with
      COW/unsharing.
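
      For illustration, a minimal sketch of the resulting hugetlb_wp() entry
      logic (variable names illustrative, not the literal upstream diff):

      ```
      	/* No maybe_mkwrite() semantics for hugetlb: writing needs VM_WRITE. */
      	if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
      		return VM_FAULT_SIGSEGV;

      	/* Handle MAP_SHARED mappings first: just map the page writable. */
      	if (vma->vm_flags & VM_MAYSHARE) {
      		set_huge_ptep_writable(vma, haddr, ptep);
      		return 0;
      	}
      	/* ... anonymous COW/unsharing handling follows ... */
      ```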
      
      Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
      Fixes: b1f9e876 ("mm/uffd: enable write protection for shmem & hugetlbfs")
      Signed-off-by: David Hildenbrand <david@redhat.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Bjorn Helgaas <bhelgaas@google.com>
      Cc: Cyrill Gorcunov <gorcunov@openvz.org>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jamie Liu <jamieliu@google.com>
      Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
      Cc: Pavel Emelyanov <xemul@parallels.com>
      Cc: Peter Feiner <pfeiner@google.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: <stable@vger.kernel.org>	[5.19]
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      1d8d1464
  4. 09 Aug 2022 (4 commits)
    • mm, hwpoison, hugetlb: support saving mechanism of raw error pages · 161df60e
      By Naoya Horiguchi
      When handling memory error on a hugetlb page, the error handler tries to
      dissolve and turn it into 4kB pages.  If it's successfully dissolved,
      PageHWPoison flag is moved to the raw error page, so that's all right. 
      However, dissolving sometimes fails, and then the error page is left
      as a hwpoisoned hugepage.  It would be useful if we could retry
      dissolving it to save the healthy pages, but that's not possible now
      because the information about where the raw error pages are is lost.
      
      Use the private field of a few tail pages to keep that information.  The
      code path of shrinking hugepage pool uses this info to try delayed
      dissolve.  In order to remember multiple errors in a hugepage, a
      singly-linked list originated from SUBPAGE_INDEX_HWPOISON-th tail page is
      constructed.  Only simple operations (adding an entry or clearing all) are
      required and the list is assumed not to be very long, so this simple data
      structure should be enough.
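
      A minimal sketch of the structure this describes (field and helper
      names are illustrative):

      ```
      /* One record per raw (4kB) error page inside the hugepage. */
      struct raw_hwp_page {
      	struct llist_node node;
      	struct page *page;
      };

      static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
      {
      	/* The list head lives in the private field of a dedicated tail page. */
      	return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
      }
      ```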
      
      If we fail to save the raw error info, the hwpoison hugepage has
      errors on an unknown subpage and this new saving mechanism no longer
      works; so disable both saving new raw error info and freeing hwpoison
      hugepages.
      
      Link: https://lkml.kernel.org/r/20220714042420.1847125-4-naoya.horiguchi@linux.dev
      Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
      Reported-by: kernel test robot <lkp@intel.com>
      Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Liu Shixin <liushixin2@huawei.com>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Oscar Salvador <osalvador@suse.de>
      Cc: Yang Shi <shy828301@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      161df60e
    • mm/hugetlb: make pud_huge() and follow_huge_pud() aware of non-present pud entry · 3a194f3f
      By Naoya Horiguchi
      follow_pud_mask() does not support non-present pud entries now.  As far
      as I tested on an x86_64 server, follow_pud_mask() still simply returns
      no_page_table() for non-present pud entries due to pud_bad(), so no
      severe user-visible effect should happen.  But generally we should call
      follow_huge_pud() for non-present pud entries of 1GB hugetlb pages.
      
      Update pud_huge() and follow_huge_pud() to handle non-present pud
      entries.  The changes are similar to the previous work for pmd entries,
      commit e66f17ff ("mm/hugetlb: take page table lock in follow_huge_pmd()")
      and commit cbef8478 ("mm/hugetlb: pmd_huge() returns true for non-present
      hugepage").
      
      Link: https://lkml.kernel.org/r/20220714042420.1847125-3-naoya.horiguchi@linux.dev
      Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
      Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: kernel test robot <lkp@intel.com>
      Cc: Liu Shixin <liushixin2@huawei.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Oscar Salvador <osalvador@suse.de>
      Cc: Yang Shi <shy828301@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      3a194f3f
    • mm/hugetlb: check gigantic_page_runtime_supported() in return_unused_surplus_pages() · c0531714
      By Naoya Horiguchi
      Patch series "mm, hwpoison: enable 1GB hugepage support", v7.
      
      
      This patch (of 8):
      
      I found a weird state of 1GB hugepage pool, caused by the following
      procedure:
      
        - run a process reserving all free 1GB hugepages,
        - shrink free 1GB hugepage pool to zero (i.e. writing 0 to
          /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages), then
        - kill the reserving process.
      
      After that, all the hugepages are free *and* surplus at the same time:
      
        $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
        3
        $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages
        3
        $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/resv_hugepages
        0
        $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/surplus_hugepages
        3
      
      This state is resolved by reserving and allocating the pages and then
      freeing them again, so this does not seem to result in a serious
      problem.  But it is a little surprising (shrinking the pool suddenly
      fails).
      
      This behavior is caused by the hstate_is_gigantic() check in
      return_unused_surplus_pages().  This was introduced long ago, in 2008,
      by commit aa888a74 ("hugetlb: support larger than MAX_ORDER"), and at
      that time gigantic pages were not supposed to be allocated/freed at
      run-time.  Now the kernel can support runtime allocation/freeing, so
      also check gigantic_page_runtime_supported().
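
      A sketch of the adjusted check (illustrative):

      ```
      	/*
      	 * Only bail out for gigantic pages when they cannot be
      	 * allocated/freed at runtime.
      	 */
      	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
      		goto out;
      ```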
      
      Link: https://lkml.kernel.org/r/20220714042420.1847125-1-naoya.horiguchi@linux.dev
      Link: https://lkml.kernel.org/r/20220714042420.1847125-2-naoya.horiguchi@linux.dev
      Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
      Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Liu Shixin <liushixin2@huawei.com>
      Cc: Yang Shi <shy828301@gmail.com>
      Cc: Oscar Salvador <osalvador@suse.de>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: kernel test robot <lkp@intel.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      c0531714
    • mm: hugetlb_vmemmap: improve hugetlb_vmemmap code readability · 6213834c
      By Muchun Song
      There is a discussion about the names hugetlb_vmemmap_alloc/free in
      thread [1].  David suggested renaming "alloc/free" to
      "optimize/restore" to make the functionality clearer to users:
      "optimize" means the function will optimize the vmemmap pages, while
      "restore" means restoring the vmemmap pages discarded before.  This
      commit does this.
      
      Another discussion is the confusion that RESERVE_VMEMMAP_NR isn't used
      explicitly for vmemmap_addr but implicitly for vmemmap_end in
      hugetlb_vmemmap_alloc/free.  David suggested we can compute what
      hugetlb_vmemmap_init() does now at runtime.  We do not need to worry
      about the overhead of computing at runtime since the calculation is
      simple enough and those functions are not in a hot path.  This commit
      has the following improvements:
      
        1) The function suffixed names ("optimize/restore") are more expressive.
        2) The logic becomes less weird in hugetlb_vmemmap_optimize/restore().
        3) hugetlb_vmemmap_init() does not need to be exported anymore.
        4) The ->optimize_vmemmap_pages field in struct hstate is killed.
        5) There is only one place that checks is_power_of_2(sizeof(struct
           page)) instead of two places.
        6) Add more comments for hugetlb_vmemmap_optimize/restore().
        7) For external users, hugetlb_optimize_vmemmap_pages() was originally
           used to detect whether a HugeTLB page's vmemmap pages are
           optimizable.  In this commit, it is killed and we introduce a new
           helper hugetlb_vmemmap_optimizable() to replace it.  The name is
           more expressive.
      
      Link: https://lore.kernel.org/all/20220404074652.68024-2-songmuchun@bytedance.com/ [1]
      Link: https://lkml.kernel.org/r/20220628092235.91270-7-songmuchun@bytedance.com
      Signed-off-by: Muchun Song <songmuchun@bytedance.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Anshuman Khandual <anshuman.khandual@arm.com>
      Cc: Catalin Marinas <catalin.marinas@arm.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Jonathan Corbet <corbet@lwn.net>
      Cc: Oscar Salvador <osalvador@suse.de>
      Cc: Will Deacon <will@kernel.org>
      Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      6213834c
  5. 19 Jul 2022 (2 commits)
  6. 18 Jul 2022 (4 commits)
    • mm, hugetlb: skip irrelevant nodes in show_free_areas() · dcadcf1c
      By Gang Li
      show_free_areas() allows filtering out node-specific data which is
      irrelevant to the allocation request.  But hugetlb_show_meminfo() still
      shows hugetlb on all nodes, which is redundant and unnecessary.
      
      Use show_mem_node_skip() to skip irrelevant nodes.  And replace
      hugetlb_show_meminfo() with hugetlb_show_meminfo_node(nid).
      
      Before-and-after sample output of an OOM:
      
      before:
      ```
      [  214.362453] Node 1 active_anon:148kB inactive_anon:4050920kB active_file:112kB inactive_file:100kB
      [  214.375429] Node 1 Normal free:45100kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig
      [  214.388334] lowmem_reserve[]: 0 0 0 0 0
      [  214.390251] Node 1 Normal: 423*4kB (UE) 320*8kB (UME) 187*16kB (UE) 117*32kB (UE) 57*64kB (UME) 20
      [  214.397626] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
      [  214.401518] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
      ```
      
      after:
      ```
      [  145.069705] Node 1 active_anon:128kB inactive_anon:4049412kB active_file:56kB inactive_file:84kB u
      [  145.110319] Node 1 Normal free:45424kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig
      [  145.152315] lowmem_reserve[]: 0 0 0 0 0
      [  145.155244] Node 1 Normal: 470*4kB (UME) 373*8kB (UME) 247*16kB (UME) 168*32kB (UE) 86*64kB (UME)
      [  145.164119] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
      ```
      
      Link: https://lkml.kernel.org/r/20220706034655.1834-1-ligang.bdlg@bytedance.com
      Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      dcadcf1c
    • hugetlb: do not update address in huge_pmd_unshare · 4ddb4d91
      By Mike Kravetz
      As an optimization for loops sequentially processing hugetlb address
      ranges, huge_pmd_unshare would update a passed address if it unshared a
      pmd.  Updating a loop control variable outside the loop like this is
      generally a bad idea.  These loops are now using hugetlb_mask_last_page to
      optimize scanning when non-present ptes are discovered.  The same can be
      done when huge_pmd_unshare returns 1 indicating a pmd was unshared.
      
      Remove address update from huge_pmd_unshare.  Change the passed argument
      type and update all callers.  In loops sequentially processing addresses
      use hugetlb_mask_last_page to update address if pmd is unshared.
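
      A sketch of the resulting caller-side loop pattern (condensed,
      illustrative):

      ```
      	last_addr_mask = hugetlb_mask_last_page(h);
      	for (address = start; address < end; address += sz) {
      		ptep = huge_pte_offset(mm, address, sz);
      		if (!ptep) {
      			/* No PT page: skip to the end of the PT page. */
      			address |= last_addr_mask;
      			continue;
      		}
      		if (huge_pmd_unshare(mm, vma, address, ptep)) {
      			/* pmd unshared: the rest of the PT page is gone too. */
      			address |= last_addr_mask;
      			continue;
      		}
      		/* ... process the pte ... */
      	}
      ```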
      
      [sfr@canb.auug.org.au: fix an unused variable warning/error]
        Link: https://lkml.kernel.org/r/20220622171117.70850960@canb.auug.org.au
      Link: https://lkml.kernel.org/r/20220621235620.291305-4-mike.kravetz@oracle.com
      Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
      Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
      Acked-by: Muchun Song <songmuchun@bytedance.com>
      Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
      Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
      Cc: Anshuman Khandual <anshuman.khandual@arm.com>
      Cc: Catalin Marinas <catalin.marinas@arm.com>
      Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: James Houghton <jthoughton@google.com>
      Cc: kernel test robot <lkp@intel.com>
      Cc: Michal Hocko <mhocko@suse.com>
      Cc: Mina Almasry <almasrymina@google.com>
      Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
      Cc: Paul Walmsley <paul.walmsley@sifive.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
      Cc: Will Deacon <will@kernel.org>
      Cc: Stephen Rothwell <sfr@canb.auug.org.au>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      4ddb4d91
    • hugetlb: skip to end of PT page mapping when pte not present · e95a9851
      By Mike Kravetz
      Patch series "hugetlb: speed up linear address scanning", v2.
      
      At unmap, fork and remap time hugetlb address ranges are linearly scanned.
      We can optimize these scans if the ranges are sparsely populated.
      
      Also, enable page table "Lazy copy" for hugetlb at fork.
      
      NOTE: Architectures not defining CONFIG_ARCH_WANT_GENERAL_HUGETLB need
      to add an arch-specific version of hugetlb_mask_last_page() to take
      advantage of the sparse address scanning improvements.  Baolin Wang
      added the routine for arm64.  Other architectures which could be
      optimized are: ia64, mips, parisc, powerpc, s390, sh and sparc.
      
      
      This patch (of 4):
      
      HugeTLB address ranges are linearly scanned during fork, unmap and remap
      operations.  If a non-present entry is encountered, the code currently
      continues to the next huge page aligned address.  However, a non-present
      entry implies that the page table page for that entry is not present. 
      Therefore, the linear scan can skip to the end of range mapped by the page
      table page.  This can speed operations on large sparsely populated hugetlb
      mappings.
      
      Create a new routine hugetlb_mask_last_page() that will return an address
      mask.  When the mask is ORed with an address, the result will be the
      address of the last huge page mapped by the associated page table page. 
      Use this mask to update addresses in routines which linearly scan hugetlb
      address ranges when a non-present pte is encountered.
      
      hugetlb_mask_last_page is related to the implementation of huge_pte_offset
      as hugetlb_mask_last_page is called when huge_pte_offset returns NULL. 
      This patch only provides a complete hugetlb_mask_last_page implementation
      when CONFIG_ARCH_WANT_GENERAL_HUGETLB is defined.  Architectures which
      provide their own versions of huge_pte_offset can also provide their own
      version of hugetlb_mask_last_page.
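
      A sketch of the generic implementation (illustrative): the returned
      mask, ORed into an address, yields the last huge page mapped by the
      same page table page.

      ```
      unsigned long hugetlb_mask_last_page(struct hstate *h)
      {
      	unsigned long hp_size = huge_page_size(h);

      	if (hp_size == PUD_SIZE)
      		return P4D_SIZE - PUD_SIZE;
      	else if (hp_size == PMD_SIZE)
      		return PUD_SIZE - PMD_SIZE;
      	return 0UL;	/* fall back to huge-page-sized steps */
      }
      ```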
      
      Link: https://lkml.kernel.org/r/20220621235620.291305-1-mike.kravetz@oracle.com
      Link: https://lkml.kernel.org/r/20220621235620.291305-2-mike.kravetz@oracle.com
      Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
      Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
      Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
      Acked-by: Muchun Song <songmuchun@bytedance.com>
      Reported-by: kernel test robot <lkp@intel.com>
      Cc: Michal Hocko <mhocko@suse.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
      Cc: James Houghton <jthoughton@google.com>
      Cc: Mina Almasry <almasrymina@google.com>
      Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
      Cc: Anshuman Khandual <anshuman.khandual@arm.com>
      Cc: Paul Walmsley <paul.walmsley@sifive.com>
      Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
      Cc: Catalin Marinas <catalin.marinas@arm.com>
      Cc: Will Deacon <will@kernel.org>
      Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
      Cc: David Hildenbrand <david@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      e95a9851
    • mm: rename is_pinnable_page() to is_longterm_pinnable_page() · 6077c943
      By Alex Sierra
      Patch series "Add MEMORY_DEVICE_COHERENT for coherent device memory
      mapping", v9.
      
      This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
      owned by a device that can be mapped into CPU page tables like
      MEMORY_DEVICE_GENERIC and can also be migrated like MEMORY_DEVICE_PRIVATE.
      
      This patch series is mostly self-contained except for a few places where
      it needs to update other subsystems to handle the new memory type.
      
      System stability and performance are not affected according to our ongoing
      testing, including xfstests.
      
      How it works: The system BIOS advertises the GPU device memory (aka VRAM)
      as SPM (special purpose memory) in the UEFI system address map.
      
      The amdgpu driver registers the memory with devmap as
      MEMORY_DEVICE_COHERENT using devm_memremap_pages.  The initial user for
      this hardware page migration capability is the Frontier supercomputer
      project.  This functionality is not AMD-specific.  We expect other GPU
      vendors to find this functionality useful, and possibly other hardware
      types in the future.
      
      Our test nodes in the lab are similar to the Frontier configuration, with
      0.5 TB of system memory plus 256 GB of device memory split across 4 GPUs,
      all in a single coherent address space.  Page migration is expected to
      improve application efficiency significantly.  We will report empirical
      results as they become available.
      
      Coherent device type pages at gup are now migrated back to system
      memory if they are being pinned long-term (FOLL_LONGTERM).  The reason
      is that long-term pinning would interfere with the device memory
      manager owning the device-coherent pages (e.g. evictions in TTM).  This
      series incorporates Alistair Popple's patches to do this migration from
      pin_user_pages() calls.  hmm_gup_test has been added to hmm-test to
      test different get user pages calls.
      
      This series includes handling of device-managed anonymous pages returned
      by vm_normal_pages.  Although they behave like normal pages for purposes
      of mapping in CPU page tables and for COW, they do not support LRU lists,
      NUMA migration or THP.
      
      We also introduced a FOLL_LRU flag that adds the same behaviour to
      follow_page and related APIs, to allow callers to specify that they expect
      to put pages on an LRU list.
      
      
      This patch (of 14):
      
      is_pinnable_page() and folio_is_pinnable() are renamed to
      is_longterm_pinnable_page() and folio_is_longterm_pinnable() respectively.
      These functions are used in the FOLL_LONGTERM flag context.
      
      Link: https://lkml.kernel.org/r/20220715150521.18165-1-alex.sierra@amd.com
      Link: https://lkml.kernel.org/r/20220715150521.18165-2-alex.sierra@amd.com
      Signed-off-by: Alex Sierra <alex.sierra@amd.com>
      Reviewed-by: David Hildenbrand <david@redhat.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: Felix Kuehling <Felix.Kuehling@amd.com>
      Cc: Ralph Campbell <rcampbell@nvidia.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Matthew Wilcox <willy@infradead.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      6077c943
  7. 04 Jul 2022 (5 commits)
  8. 29 Jun 2022 (1 commit)
  9. 28 Jun 2022 (1 commit)
  10. 02 Jun 2022 (1 commit)
  11. 27 May 2022 (1 commit)
  12. 13 May 2022 (8 commits)
    • mm/hugetlb: handle uffd-wp during fork() · bc70fbf2
      By Peter Xu
      Firstly, we'll need to pass dst_vma into copy_hugetlb_page_range(),
      because for uffd-wp it's the dst vma that matters when deciding how we
      should treat uffd-wp protected ptes.
      
      We should recognize pte markers during fork and do the pte copy if needed.
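
      A sketch of the fork-time decision (illustrative; the lkp note below
      is about making this helper static):

      ```
      static bool vma_needs_copy(struct vm_area_struct *dst_vma,
      			   struct vm_area_struct *src_vma)
      {
      	/* Always copy pgtables when dst_vma has uffd-wp enabled. */
      	if (userfaultfd_wp(dst_vma))
      		return true;

      	if (src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP))
      		return true;

      	if (src_vma->anon_vma)
      		return true;

      	return false;
      }
      ```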
      
      [lkp@intel.com: vma_needs_copy can be static]
        Link: https://lkml.kernel.org/r/Ylb0CGeFJlc4EzLk@7ec4ff11d4ae
      Link: https://lkml.kernel.org/r/20220405014918.14932-1-peterx@redhat.com
      Signed-off-by: Peter Xu <peterx@redhat.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Axel Rasmussen <axelrasmussen@google.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
      Cc: Nadav Amit <nadav.amit@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      bc70fbf2
    • mm/hugetlb: only drop uffd-wp special pte if required · 05e90bd0
      By Peter Xu
      As with shmem uffd-wp special ptes, only drop the uffd-wp special swap pte
      if unmapping an entire vma or synchronized such that faults can not race
      with the unmap operation.  This requires passing zap_flags all the way to
      the lowest level hugetlb unmap routine: __unmap_hugepage_range.
      
      In general, unmap calls originated in hugetlbfs code will pass the
      ZAP_FLAG_DROP_MARKER flag as synchronization is in place to prevent
      faults.  The exception is hole punch which will first unmap without any
      synchronization.  Later when hole punch actually removes the page from the
      file, it will check to see if there was a subsequent fault and if so take
      the hugetlb fault mutex while unmapping again.  This second unmap will
      pass in ZAP_FLAG_DROP_MARKER.
      
      The justification for "whether to apply the ZAP_FLAG_DROP_MARKER flag
      when unmapping a hugetlb range" is (IMHO): we should never reach a
      state where a page fault could erroneously fault in a writable
      page-cache page that was wr-protected, even for an extremely short
      period.  That could happen if e.g. we passed ZAP_FLAG_DROP_MARKER when
      hugetlbfs_punch_hole() calls hugetlb_vmdelete_list(), because if a page
      faults after that call and before remove_inode_hugepages() is executed,
      the page cache can be mapped writable again in the small racy window,
      which can cause unexpected data to be overwritten.
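
      A condensed sketch of the unmap-side rule for a swap-form pte
      (illustrative):

      ```
      	if (pte_swp_uffd_wp_any(pte) &&
      	    !(zap_flags & ZAP_FLAG_DROP_MARKER))
      		/* Keep a marker so a racing fault stays write-protected. */
      		set_huge_pte_at(mm, address, ptep,
      				make_pte_marker(PTE_MARKER_UFFD_WP));
      	else
      		huge_pte_clear(mm, address, ptep, sz);
      ```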
      
      [peterx@redhat.com: fix sparse warning]
        Link: https://lkml.kernel.org/r/Ylcdw8I1L5iAoWhb@xz-m1.local
      [akpm@linux-foundation.org: move zap_flags_t from mm.h to mm_types.h to fix build issues]
      Link: https://lkml.kernel.org/r/20220405014915.14873-1-peterx@redhat.com
      Signed-off-by: Peter Xu <peterx@redhat.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Axel Rasmussen <axelrasmussen@google.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
      Cc: Nadav Amit <nadav.amit@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      05e90bd0
    • mm/hugetlb: allow uffd wr-protect none ptes · 60dfaad6
      By Peter Xu
      Teach hugetlbfs code to wr-protect none ptes just in case the page
      cache exists for that pte.  Meanwhile we also need to be able to
      recognize a uffd-wp marker pte and remove it for uffd_wp_resolve.

      While at it, introduce a variable "psize" to replace all references to
      the huge page size fetcher.
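
      A condensed sketch of the none-pte and marker handling in
      hugetlb_change_protection() (illustrative):

      ```
      	if (unlikely(is_pte_marker(pte))) {
      		/* Drop the marker only when resolving write protection. */
      		if (uffd_wp_resolve)
      			huge_pte_clear(mm, address, ptep, psize);
      	} else if (huge_pte_none(pte)) {
      		/* Wr-protect a none pte in case the page cache exists. */
      		if (unlikely(uffd_wp))
      			set_huge_pte_at(mm, address, ptep,
      					make_pte_marker(PTE_MARKER_UFFD_WP));
      	}
      ```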
      
      Link: https://lkml.kernel.org/r/20220405014912.14815-1-peterx@redhat.com
      Signed-off-by: Peter Xu <peterx@redhat.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Axel Rasmussen <axelrasmussen@google.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
      Cc: Nadav Amit <nadav.amit@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      60dfaad6
    • mm/hugetlb: handle pte markers in page faults · c64e912c
      By Peter Xu
      Allow hugetlb code to handle pte markers just like none ptes.  It's mostly
      there, we just need to make sure we don't assume hugetlb_no_page() only
      handles none pte, so when detecting pte change we should use pte_same()
      rather than pte_none().  We need to pass in the old_pte to do the
      comparison.
      
      Check the original pte to see whether it's a pte marker, if it is, we
      should recover uffd-wp bit on the new pte to be installed, so that the
      next write will be trapped by uffd.
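
      A sketch of the pte_same() based recheck (illustrative):

      ```
      	ptl = huge_pte_lock(h, mm, ptep);
      	/*
      	 * With pte markers around, pte_none() is no longer a sufficient
      	 * "nothing changed" test: compare against the pte the caller saw.
      	 */
      	if (!pte_same(huge_ptep_get(ptep), old_pte))
      		goto backout;
      ```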
      
      Link: https://lkml.kernel.org/r/20220405014909.14761-1-peterx@redhat.com
      Signed-off-by: Peter Xu <peterx@redhat.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Axel Rasmussen <axelrasmussen@google.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
      Cc: Nadav Amit <nadav.amit@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      c64e912c
    • mm/hugetlb: handle UFFDIO_WRITEPROTECT · 5a90d5a1
      By Peter Xu
      This starts from passing cp_flags into hugetlb_change_protection() so
      hugetlb will be able to handle MM_CP_UFFD_WP[_RESOLVE] requests.
      
      huge_pte_clear_uffd_wp() is introduced to handle the case where the
      UFFDIO_WRITEPROTECT is requested upon migrating huge page entries.
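
      A condensed sketch of how the two requests map onto a present huge pte
      (illustrative):

      ```
      	pte = huge_pte_modify(old_pte, newprot);
      	if (uffd_wp)
      		pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
      	else if (uffd_wp_resolve)
      		pte = huge_pte_clear_uffd_wp(pte);
      	set_huge_pte_at(mm, address, ptep, pte);
      ```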
      
      Link: https://lkml.kernel.org/r/20220405014906.14708-1-peterx@redhat.com
      Signed-off-by: Peter Xu <peterx@redhat.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Axel Rasmussen <axelrasmussen@google.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
      Cc: Nadav Amit <nadav.amit@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      5a90d5a1
    • mm/hugetlb: take care of UFFDIO_COPY_MODE_WP · 6041c691
      By Peter Xu
      Pass the wp_copy variable into hugetlb_mcopy_atomic_pte() throughout
      the stack.  Apply the UFFD_WP bit if UFFDIO_COPY_MODE_WP is set with
      UFFDIO_COPY.
      
      Hugetlb pages are only managed by hugetlbfs, so we're safe even without
      setting dirty bit in the huge pte if the page is installed as read-only. 
      However we'd better still keep the dirty bit set for a read-only
      UFFDIO_COPY pte (when UFFDIO_COPY_MODE_WP bit is set), not only to match
      what we do with shmem, but also because the page does contain dirty data
      that the kernel just copied from the userspace.
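
      A sketch of the pte construction this implies (illustrative):

      ```
      	/*
      	 * Read-only when wp_copy is requested, but always dirty, since
      	 * the kernel just copied user data into the page.
      	 */
      	writable = dst_vma->vm_flags & VM_WRITE;
      	if (wp_copy)
      		writable = 0;

      	_dst_pte = make_huge_pte(dst_vma, page, writable);
      	_dst_pte = huge_pte_mkdirty(_dst_pte);
      	if (wp_copy)
      		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
      ```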
      
      Link: https://lkml.kernel.org/r/20220405014904.14643-1-peterx@redhat.com
      Signed-off-by: Peter Xu <peterx@redhat.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Axel Rasmussen <axelrasmussen@google.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
      Cc: Nadav Amit <nadav.amit@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      6041c691
    • mm/hugetlb: hook page faults for uffd write protection · 166f3ecc
      By Peter Xu
      Hook up hugetlbfs_fault() with the capability to handle userfaultfd-wp
      faults.
      
      We do this slightly earlier than hugetlb_cow() so that we can avoid taking
      some extra locks that we definitely don't need.
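
      A sketch of the hugetlb_fault() hook (illustrative):

      ```
      	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(entry) &&
      	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
      		struct vm_fault vmf = {
      			.vma = vma,
      			.address = haddr,
      			.real_address = address,
      			.flags = flags,
      		};

      		/* Hand the wr-protected fault to userspace. */
      		return handle_userfault(&vmf, VM_UFFD_WP);
      	}
      ```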
      
      Link: https://lkml.kernel.org/r/20220405014901.14590-1-peterx@redhat.com
      Signed-off-by: Peter Xu <peterx@redhat.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Axel Rasmussen <axelrasmussen@google.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
      Cc: Matthew Wilcox <willy@infradead.org>
      Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
      Cc: Nadav Amit <nadav.amit@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      166f3ecc
    • mm: hugetlb: considering PMD sharing when flushing cache/TLBs · 3d0b95cd
      By Baolin Wang
      This patchset fixes some cache flushing issues when PMD sharing is
      possible for hugetlb pages, which were found by code inspection. 
      Meanwhile Mike found that flush_cache_page() can not cover the whole
      size of a hugetlb page on some architectures [1], so I added a new
      patch 3 to fix this issue, since after some investigation I found only
      try_to_unmap_one() and try_to_migrate_one() need fixing.
      
      [1] https://lore.kernel.org/linux-mm/064da3bb-5b4b-7332-a722-c5a541128705@oracle.com/
      
      
      This patch (of 3):
      
      When moving hugetlb page tables, the cache flushing is called in
      move_page_tables() without considering the shared PMDs, which may cause
      cache issues on some architectures.

      Thus we should move the hugetlb cache flushing into
      move_hugetlb_page_tables(), taking into account the shared PMD ranges
      calculated by adjust_range_if_pmd_sharing_possible().  Meanwhile, also
      expand the TLB flushing range in case of shared PMDs.
      
      Note this was discovered via code inspection and has not been hit as a
      real problem in practice so far.
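
      A condensed sketch of the adjusted flushing in
      move_hugetlb_page_tables() (illustrative):

      ```
      	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
      				old_addr, old_end);
      	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
      	mmu_notifier_invalidate_range_start(&range);
      	flush_cache_range(vma, range.start, range.end);
      	/* ... move the page table entries ... */
      	flush_tlb_range(vma, range.start, range.end);
      	mmu_notifier_invalidate_range_end(&range);
      ```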
      
      Link: https://lkml.kernel.org/r/cover.1651056365.git.baolin.wang@linux.alibaba.com
      Link: https://lkml.kernel.org/r/0443c8cf20db554d3ff4b439b30e0ff26c0181dd.1651056365.git.baolin.wang@linux.alibaba.com
      Fixes: 550a7d60 ("mm, hugepages: add mremap() support for hugepage backed vma")
      Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Reviewed-by: Muchun Song <songmuchun@bytedance.com>
      Cc: Mina Almasry <almasrymina@google.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      3d0b95cd
  13. 10 May 2022 (8 commits)
    • mm/gup: sanity-check with CONFIG_DEBUG_VM that anonymous pages are exclusive when (un)pinning · b6a2619c
      By David Hildenbrand
      Let's verify when (un)pinning anonymous pages that we always deal with
      exclusive anonymous pages, which guarantees that we'll have a reliable
      PIN, meaning that we cannot end up with the GUP pin being inconsistent
      with the pages mapped into the page tables due to a COW triggered by a
      write fault.
      
      When pinning pages, after conditionally triggering GUP unsharing of
      possibly shared anonymous pages, we should always only see exclusive
      anonymous pages.  Note that anonymous pages that are mapped writable must
      be marked exclusive, otherwise we'd have a BUG.
      
      When pinning during ordinary GUP, simply add a check after our conditional
      GUP-triggered unsharing checks.  As we know exactly how the page is
      mapped, we know exactly in which page we have to check for
      PageAnonExclusive().
      
      When pinning via GUP-fast we have to be careful, because we can race with
      fork(): verify only after we made sure via the seqcount that we didn't
      race with concurrent fork() that we didn't end up pinning a possibly
      shared anonymous page.
      
      Similarly, when unpinning, verify that the pages are still marked as
      exclusive: otherwise something turned the pages possibly shared, which can
      result in random memory corruptions, which we really want to catch.
      
      With only the pinned pages at hand and not the actual page table entries
      we have to be a bit careful: hugetlb pages are always mapped via a single
      logical page table entry referencing the head page and PG_anon_exclusive
      of the head page applies.  Anon THP are a bit more complicated, because we
      might have obtained the page reference either via a PMD or a PTE --
      depending on the mapping type, either PageAnonExclusive of the head
      page (PMD-mapped THP) or of the tail page (PTE-mapped THP) applies: as
      we don't know which, and to make our life easier, check that either is
      set.
      
      Take care to not verify in case we're unpinning during GUP-fast because we
      detected concurrent fork(): we might stumble over an anonymous page that
      is now shared.
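
      A sketch of the unpin-time check (illustrative):

      ```
      static void sanity_check_pinned_pages(struct page **pages,
      				      unsigned long npages)
      {
      	if (!IS_ENABLED(CONFIG_DEBUG_VM))
      		return;

      	for (; npages; npages--, pages++) {
      		struct page *page = *pages;
      		struct page *head = compound_head(page);

      		if (!PageAnon(head))
      			continue;
      		if (!PageCompound(head) || PageHuge(head))
      			VM_BUG_ON_PAGE(!PageAnonExclusive(head), page);
      		else
      			/* Either the head (PMD-mapped) or tail (PTE-mapped). */
      			VM_BUG_ON_PAGE(!PageAnonExclusive(head) &&
      				       !PageAnonExclusive(page), page);
      	}
      }
      ```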
      
      Link: https://lkml.kernel.org/r/20220428083441.37290-18-david@redhat.com
      Signed-off-by: David Hildenbrand <david@redhat.com>
      Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Don Dutile <ddutile@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jann Horn <jannh@google.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: John Hubbard <jhubbard@nvidia.com>
      Cc: Khalid Aziz <khalid.aziz@oracle.com>
      Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Liang Zhang <zhangliang5@huawei.com>
      Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.ibm.com>
      Cc: Nadav Amit <namit@vmware.com>
      Cc: Oded Gabbay <oded.gabbay@gmail.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Roman Gushchin <guro@fb.com>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Yang Shi <shy828301@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      b6a2619c
    • mm/gup: trigger FAULT_FLAG_UNSHARE when R/O-pinning a possibly shared anonymous page · a7f22660
      By David Hildenbrand
      Whenever GUP currently ends up taking a R/O pin on an anonymous page that
      might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault
      on the page table entry will end up replacing the mapped anonymous page
      due to COW, resulting in the GUP pin no longer being consistent with the
      page actually mapped into the page table.
      
      The possible ways to deal with this situation are:
       (1) Ignore and pin -- what we do right now.
       (2) Fail to pin -- which would be rather surprising to callers and
           could break user space.
       (3) Trigger unsharing and pin the now exclusive page -- reliable R/O
           pins.
      
      Let's implement 3) because it provides the clearest semantics and allows
      for checking in unpin_user_pages() and friends for possible BUGs: when
      trying to unpin a page that's no longer exclusive, clearly something went
      very wrong and might result in memory corruptions that might be hard to
      debug.  So we better have a nice way to spot such issues.
      
      This change implies that whenever user space *wrote* to a private mapping
      (IOW, we have an anonymous page mapped), that GUP pins will always remain
      consistent: reliable R/O GUP pins of anonymous pages.
      
      As a side note, this commit fixes the COW security issue for hugetlb with
      FOLL_PIN as documented in:
        https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com
      The vmsplice reproducer still applies, because vmsplice uses FOLL_GET
      instead of FOLL_PIN.
      
      Note that follow_huge_pmd() doesn't apply because we cannot end up in
      there with FOLL_PIN.
      
      This commit is heavily based on prototype patches by Andrea.
      
      Link: https://lkml.kernel.org/r/20220428083441.37290-17-david@redhat.com
      Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
      Signed-off-by: David Hildenbrand <david@redhat.com>
      Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Co-developed-by: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Don Dutile <ddutile@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jann Horn <jannh@google.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: John Hubbard <jhubbard@nvidia.com>
      Cc: Khalid Aziz <khalid.aziz@oracle.com>
      Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Liang Zhang <zhangliang5@huawei.com>
      Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.ibm.com>
      Cc: Nadav Amit <namit@vmware.com>
      Cc: Oded Gabbay <oded.gabbay@gmail.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Roman Gushchin <guro@fb.com>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Yang Shi <shy828301@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      a7f22660
    • mm: support GUP-triggered unsharing of anonymous pages · c89357e2
      By David Hildenbrand
      Whenever GUP currently ends up taking a R/O pin on an anonymous page that
      might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault
      on the page table entry will end up replacing the mapped anonymous page
      due to COW, resulting in the GUP pin no longer being consistent with the
      page actually mapped into the page table.
      
      The possible ways to deal with this situation are:
       (1) Ignore and pin -- what we do right now.
       (2) Fail to pin -- which would be rather surprising to callers and
           could break user space.
       (3) Trigger unsharing and pin the now exclusive page -- reliable R/O
           pins.
      
      We want to implement 3) because it provides the clearest semantics and
      allows for checking in unpin_user_pages() and friends for possible BUGs:
      when trying to unpin a page that's no longer exclusive, clearly something
      went very wrong and might result in memory corruptions that might be hard
      to debug.  So we better have a nice way to spot such issues.
      
      To implement 3), we need a way for GUP to trigger unsharing:
      FAULT_FLAG_UNSHARE.  FAULT_FLAG_UNSHARE is only applicable to R/O mapped
      anonymous pages and resembles COW logic during a write fault.  However, in
      contrast to a write fault, GUP-triggered unsharing will, for example,
      still maintain the write protection.
      
      Let's implement FAULT_FLAG_UNSHARE by hooking into the existing write
      fault handlers for all applicable anonymous page types: ordinary pages,
      THP and hugetlb.
      
      * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that has been
        marked exclusive in the meantime by someone else, there is nothing to do.
      * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that's not
        marked exclusive, it will try detecting if the process is the exclusive
        owner. If exclusive, it can be set exclusive similar to reuse logic
        during write faults via page_move_anon_rmap() and there is nothing
        else to do; otherwise, we either have to copy and map a fresh,
        anonymous exclusive page R/O (ordinary pages, hugetlb), or split the
        THP.
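
      A heavily condensed sketch of the unshare decision inside a write-fault
      handler (illustrative; the real reuse test is more involved than a bare
      refcount check):

      ```
      	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;

      	if (unshare && vmf->page && PageAnon(vmf->page)) {
      		if (PageAnonExclusive(vmf->page))
      			return 0;	/* already exclusive: nothing to do */
      		if (page_count(vmf->page) == 1) {
      			/* Sole owner: mark exclusive, keep write protection. */
      			page_move_anon_rmap(vmf->page, vmf->vma);
      			return 0;
      		}
      		/* Possibly shared: copy a fresh exclusive page, map it R/O. */
      		return wp_page_copy(vmf);
      	}
      ```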
      
      This commit is heavily based on patches by Andrea.
      
      Link: https://lkml.kernel.org/r/20220428083441.37290-16-david@redhat.com
      Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
      Signed-off-by: David Hildenbrand <david@redhat.com>
      Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Co-developed-by: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Don Dutile <ddutile@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jann Horn <jannh@google.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: John Hubbard <jhubbard@nvidia.com>
      Cc: Khalid Aziz <khalid.aziz@oracle.com>
      Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Liang Zhang <zhangliang5@huawei.com>
      Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.ibm.com>
      Cc: Nadav Amit <namit@vmware.com>
      Cc: Oded Gabbay <oded.gabbay@gmail.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Roman Gushchin <guro@fb.com>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Yang Shi <shy828301@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      c89357e2
    • mm/gup: disallow follow_page(FOLL_PIN) · 8909691b
      By David Hildenbrand
      We want to change the way we handle R/O pins on anonymous pages that might
      be shared: if we detect a possibly shared anonymous page -- mapped R/O and
      !PageAnonExclusive() -- we want to trigger unsharing via a page fault,
      resulting in an exclusive anonymous page that can be pinned reliably
      without getting replaced via COW on the next write fault.
      
      However, the required page fault will be problematic for follow_page(): in
      contrast to ordinary GUP, follow_page() doesn't trigger faults internally.
      So we would have to end up failing a R/O pin via follow_page(), although
      there is something mapped R/O into the page table, which might be rather
      surprising.
      
      We don't seem to have follow_page(FOLL_PIN) users, and it's a purely
      internal MM function.  Let's just make our life easier and the semantics
      of follow_page() clearer by just disallowing FOLL_PIN for follow_page()
      completely.
      
      Link: https://lkml.kernel.org/r/20220428083441.37290-15-david@redhat.com
      Signed-off-by: David Hildenbrand <david@redhat.com>
      Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Don Dutile <ddutile@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jann Horn <jannh@google.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: John Hubbard <jhubbard@nvidia.com>
      Cc: Khalid Aziz <khalid.aziz@oracle.com>
      Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Liang Zhang <zhangliang5@huawei.com>
      Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.ibm.com>
      Cc: Nadav Amit <namit@vmware.com>
      Cc: Oded Gabbay <oded.gabbay@gmail.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Roman Gushchin <guro@fb.com>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Yang Shi <shy828301@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      8909691b
    • mm: remember exclusively mapped anonymous pages with PG_anon_exclusive · 6c287605
      By David Hildenbrand
      Let's mark exclusively mapped anonymous pages with PG_anon_exclusive as
      exclusive, and use that information to make GUP pins reliable and stay
      consistent with the page mapped into the page table even if the page table
      entry gets write-protected.
      
      With that information at hand, we can extend our COW logic to always reuse
      anonymous pages that are exclusive.  For anonymous pages that might be
      shared, the existing logic applies.
      
      As already documented, PG_anon_exclusive is usually only expressive in
      combination with a page table entry.  Especially PTE vs.  PMD-mapped
      anonymous pages require more thought, some examples: due to mremap() we
      can easily have a single compound page PTE-mapped into multiple page
      tables exclusively in a single process -- multiple page table locks apply.
      Further, due to MADV_WIPEONFORK we might not necessarily write-protect
      all PTEs, and only some subpages might be pinned.  Long story short: once
      PTE-mapped, we have to track information about exclusivity per sub-page,
      but until then, we can just track it for the compound page in the head
      page and not having to update a whole bunch of subpages all of the time
      for a simple PMD mapping of a THP.
      
      For simplicity, this commit mostly talks about "anonymous pages", while
      it's for THP actually "the part of an anonymous folio referenced via a
      page table entry".
      
      To not spill PG_anon_exclusive code all over the mm code-base, we let
      the anon rmap code handle all the PG_anon_exclusive logic it can easily
      handle.
      
      If a writable, present page table entry points at an anonymous
      (sub)page, that (sub)page must be PG_anon_exclusive.  If GUP wants to
      take a reliable pin (FOLL_PIN) on an anonymous page referenced via a
      present page table entry, it must only pin if PG_anon_exclusive is set
      for the mapped (sub)page.
      
      This commit doesn't adjust GUP, so this is only implicitly handled for
      FOLL_WRITE, follow-up commits will teach GUP to also respect it for
      FOLL_PIN without FOLL_WRITE, to make all GUP pins of anonymous pages fully
      reliable.
      
      Whenever an anonymous page is to be shared (fork(), KSM), or when
      temporarily unmapping an anonymous page (swap, migration), the relevant
      PG_anon_exclusive bit has to be cleared to mark the anonymous page
      possibly shared.  Clearing will fail if there are GUP pins on the page:
      
      * For fork(), this means having to copy the page and not being able to
        share it.  fork() protects against concurrent GUP using the PT lock and
        the src_mm->write_protect_seq.
      
      * For KSM, this means sharing will fail.  For swap this means
        unmapping will fail; for migration this means migration will fail
        early.  All three cases protect against concurrent GUP using the PT
        lock and a proper clear/invalidate+flush of the relevant page table
        entry.
      
      This fixes memory corruptions reported for FOLL_PIN | FOLL_WRITE, when a
      pinned page gets mapped R/O and the successive write fault ends up
      replacing the page instead of reusing it.  It improves the situation for
      O_DIRECT/vmsplice/...  that still use FOLL_GET instead of FOLL_PIN, if
      fork() is *not* involved, however swapout and fork() are still
      problematic.  Properly using FOLL_PIN instead of FOLL_GET for these GUP
      users will fix the issue for them.
      
      I. Details about basic handling
      
      I.1. Fresh anonymous pages
      
      page_add_new_anon_rmap() and hugepage_add_new_anon_rmap() will mark the
      given page exclusive via __page_set_anon_rmap(exclusive=1).  As that is
      the mechanism fresh anonymous pages come into life (besides migration code
      where we copy the page->mapping), all fresh anonymous pages will start out
      as exclusive.
      
      I.2. COW reuse handling of anonymous pages
      
      When a COW handler stumbles over a (sub)page that's marked exclusive, it
      simply reuses it.  Otherwise, the handler tries harder under page lock to
      detect if the (sub)page is exclusive and can be reused.  If exclusive,
      page_move_anon_rmap() will mark the given (sub)page exclusive.
      
      Note that hugetlb code does not yet check for PageAnonExclusive(), as it
      still uses the old COW logic that is prone to the COW security issue
      because hugetlb code cannot really tolerate unnecessary/wrong COW as huge
      pages are a scarce resource.
      
      I.3. Migration handling
      
      try_to_migrate() has to try marking an exclusive anonymous page shared via
      page_try_share_anon_rmap().  If it fails because there are GUP pins on the
      page, unmap fails.  migrate_vma_collect_pmd() and
      __split_huge_pmd_locked() are handled similarly.
      
      Writable migration entries implicitly point at shared anonymous pages. 
      For readable migration entries that information is stored via a new
      "readable-exclusive" migration entry, specific to anonymous pages.
      
      When restoring a migration entry in remove_migration_pte(), information
      about exlusivity is detected via the migration entry type, and
      RMAP_EXCLUSIVE is set accordingly for
      page_add_anon_rmap()/hugepage_add_anon_rmap() to restore that information.
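
For illustration, a hedged sketch of how the entry type translates back
into rmap flags on restore (the helper name is made up; the logic lives
inline in remove_migration_pte()):

  static rmap_t migration_entry_rmap_flags(swp_entry_t entry)
  {
          /*
           * Both writable and "readable-exclusive" entries imply
           * exclusivity; only plain readable entries mean "possibly shared".
           */
          return is_readable_migration_entry(entry) ? 0 : RMAP_EXCLUSIVE;
  }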
      
      I.4. Swapout handling
      
try_to_unmap() has to try marking the mapped page possibly shared via
page_try_share_anon_rmap().  If it fails because there are GUP pins on the
page, unmap fails.  For now, information about exclusivity is lost.  In
the future, we might want to remember that information in the swap entry
in some cases; however, that requires more thought, care, and a way to
store it in swap entries.
      
      I.5. Swapin handling
      
do_swap_page() will never stumble over exclusive anonymous pages in the
swap cache, as try_to_unmap() prohibits that.  do_swap_page() always has
to manually detect whether an anonymous page is exclusive and set
RMAP_EXCLUSIVE for page_add_anon_rmap() accordingly.
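
A hedged sketch of one conservative way to make that decision (a
simplification, not the exact kernel logic, which also considers KSM and
swapcache references):

  static bool swapped_in_page_exclusive(struct page *page)
  {
          /*
           * If the page is gone from the swapcache and we hold the only
           * reference, nobody else can possibly share it.
           */
          return !PageSwapCache(page) && page_count(page) == 1;
  }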
      
      I.6. THP handling
      
      __split_huge_pmd_locked() has to move the information about exclusivity
      from the PMD to the PTEs.
      
      a) In case we have a readable-exclusive PMD migration entry, simply
         insert readable-exclusive PTE migration entries.
      
      b) In case we have a present PMD entry and we don't want to freeze
         ("convert to migration entries"), simply forward PG_anon_exclusive to
         all sub-pages, no need to temporarily clear the bit.
      
c) In case we have a present PMD entry and want to freeze, handle it
   similarly to try_to_migrate(): try marking the page shared first.  If
   that fails, we ignore the "freeze" instruction and simply split
   ordinarily.  try_to_migrate() will properly fail because the THP is
   still mapped via PTEs.
      
      When splitting a compound anonymous folio (THP), the information about
      exclusivity is implicitly handled via the migration entries: no need to
      replicate PG_anon_exclusive manually.
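
For case b) above, a hedged sketch of forwarding the bit while PTE-mapping
a present PMD (heavily condensed from the __split_huge_pmd_locked() flow;
variable setup omitted):

  bool anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
  unsigned long addr;
  int i;

  for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
          pte_t entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));

          /* Forward exclusivity from the head page to every subpage. */
          if (anon_exclusive)
                  SetPageAnonExclusive(page + i);
          set_pte_at(mm, addr, pte + i, entry);
  }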
      
I.7. fork() handling

fork() handling is relatively easy, because PG_anon_exclusive is only
expressive for some page table entry types.
      
      a) Present anonymous pages
      
page_try_dup_anon_rmap() will mark the given subpage shared -- which will
fail if the page is pinned.  If it fails, we have to copy the page (or
PTE-map a PMD to handle it on the PTE level); a sketch of this decision
follows the list below.

Note that device exclusive entries are just a pointer at a PageAnon()
page.  fork() will first convert a device exclusive entry to a present
page table entry and handle it just like present anonymous pages.
      
      b) Device private entry
      
      Device private entries point at PageAnon() pages that cannot be mapped
      directly and, therefore, cannot get pinned.
      
page_try_dup_anon_rmap() will mark the given subpage shared, which cannot
fail because such pages cannot get pinned.
      
      c) HW poison entries
      
      PG_anon_exclusive will remain untouched and is stale -- the page table
      entry is just a placeholder after all.
      
      d) Migration entries
      
      Writable and readable-exclusive entries are converted to readable entries:
      possibly shared.
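
A hedged sketch of the decision from case a), condensed from the PTE copy
path of fork() (argument lists abbreviated; not the verbatim kernel code):

  /* In copy_present_pte(), for a present anonymous page: */
  if (page && PageAnon(page)) {
          if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
                  /*
                   * The page may be pinned: back off and copy it for the
                   * child instead of sharing it.
                   */
                  return copy_present_page(dst_vma, src_vma, dst_pte,
                                           src_pte, addr, rss, prealloc,
                                           page);
          }
          rss[MM_ANONPAGES]++;
  }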
      
      I.8. mprotect() handling
      
      mprotect() only has to properly handle the new readable-exclusive
      migration entry:
      
      When write-protecting a migration entry that points at an anonymous page,
      remember the information about exclusivity via the "readable-exclusive"
      migration entry type.
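
A hedged sketch of that conversion as it could appear in the mprotect()
PTE loop (condensed; the entry helper names follow this series):

  if (is_writable_migration_entry(entry)) {
          struct page *page = pfn_swap_entry_to_page(entry);

          /*
           * Only anonymous pages need to remember exclusivity; everything
           * else can use a plain readable migration entry.
           */
          if (PageAnon(page))
                  entry = make_readable_exclusive_migration_entry(
                                                  swp_offset(entry));
          else
                  entry = make_readable_migration_entry(swp_offset(entry));
          newpte = swp_entry_to_pte(entry);
  }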
      
      II. Migration and GUP-fast
      
      Whenever replacing a present page table entry that maps an exclusive
      anonymous page by a migration entry, we have to mark the page possibly
      shared and synchronize against GUP-fast by a proper clear/invalidate+flush
      to make the following scenario impossible:
      
      1. try_to_migrate() places a migration entry after checking for GUP pins
         and marks the page possibly shared.
      
2. GUP-fast pins the page due to lack of synchronization.

3. fork() converts the "writable/readable-exclusive" migration entry into a
   readable migration entry.

4. Migration fails due to the GUP pin (failing to freeze the refcount).

5. Migration entries are restored.  PG_anon_exclusive is lost.
      
      -> We have a pinned page that is not marked exclusive anymore.
      
      Note that we move information about exclusivity from the page to the
      migration entry as it otherwise highly overcomplicates fork() and
      PTE-mapping a THP.
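
A hedged sketch of the resulting ordering, condensed from the
try_to_migrate_one() flow described above (surrounding rmap-walk context
omitted):

  /* 1. Clear and flush the PTE: GUP-fast can no longer walk to the page. */
  pteval = ptep_clear_flush(vma, address, pvmw.pte);

  /* 2. Only now try to mark the page possibly shared. */
  if (PageAnon(page) && PageAnonExclusive(page) &&
      page_try_share_anon_rmap(page)) {
          /* GUP pins exist: restore the PTE and fail the unmap. */
          set_pte_at(mm, address, pvmw.pte, pteval);
          ret = false;
  }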
      
      III. Swapout and GUP-fast
      
      Whenever replacing a present page table entry that maps an exclusive
      anonymous page by a swap entry, we have to mark the page possibly shared
      and synchronize against GUP-fast by a proper clear/invalidate+flush to
      make the following scenario impossible:
      
      1. try_to_unmap() places a swap entry after checking for GUP pins and
         clears exclusivity information on the page.
      
      2. GUP-fast pins the page due to lack of synchronization.
      
      -> We have a pinned page that is not marked exclusive anymore.
      
      If we'd ever store information about exclusivity in the swap entry,
      similar to migration handling, the same considerations as in II would
      apply.  This is future work.
      
Link: https://lkml.kernel.org/r/20220428083441.37290-13-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Don Dutile <ddutile@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jann Horn <jannh@google.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: John Hubbard <jhubbard@nvidia.com>
      Cc: Khalid Aziz <khalid.aziz@oracle.com>
      Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Liang Zhang <zhangliang5@huawei.com>
      Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.ibm.com>
      Cc: Nadav Amit <namit@vmware.com>
      Cc: Oded Gabbay <oded.gabbay@gmail.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Roman Gushchin <guro@fb.com>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      6c287605
    • D
      mm/page-flags: reuse PG_mappedtodisk as PG_anon_exclusive for PageAnon() pages · 78fbe906
Committed by David Hildenbrand
      The basic question we would like to have a reliable and efficient answer
      to is: is this anonymous page exclusive to a single process or might it be
      shared?  We need that information for ordinary/single pages, hugetlb
      pages, and possibly each subpage of a THP.
      
      Introduce a way to mark an anonymous page as exclusive, with the ultimate
      goal of teaching our COW logic to not do "wrong COWs", whereby GUP pins
      lose consistency with the pages mapped into the page table, resulting in
      reported memory corruptions.
      
Most pageflags already have semantics for anonymous pages; however,
PG_mappedtodisk should never apply to pages in the swapcache, so let's
reuse that flag.
      
As PG_has_hwpoisoned also uses that flag on the second tail page of a
compound page, convert it to use PG_error instead, which is marked
PF_NO_TAIL and therefore never used for tail pages.
      
      Use custom page flag modification functions such that we can do additional
      sanity checks.  The semantics we'll put into some kernel doc in the future
      are:
      
      "
        PG_anon_exclusive is *usually* only expressive in combination with a
        page table entry. Depending on the page table entry type it might
        store the following information:
      
             Is what's mapped via this page table entry exclusive to the
             single process and can be mapped writable without further
             checks? If not, it might be shared and we might have to COW.
      
        For now, we only expect PTE-mapped THPs to make use of
        PG_anon_exclusive in subpages. For other anonymous compound
        folios (i.e., hugetlb), only the head page is logically mapped and
        holds this information.
      
        For example, an exclusive, PMD-mapped THP only has PG_anon_exclusive
        set on the head page. When replacing the PMD by a page table full
        of PTEs, PG_anon_exclusive, if set on the head page, will be set on
        all tail pages accordingly. Note that converting from a PTE-mapping
        to a PMD mapping using the same compound page is currently not
        possible and consequently doesn't require care.
      
        If GUP wants to take a reliable pin (FOLL_PIN) on an anonymous page,
        it should only pin if the relevant PG_anon_exclusive is set. In that
        case, the pin will be fully reliable and stay consistent with the pages
        mapped into the page table, as the bit cannot get cleared (e.g., by
        fork(), KSM) while the page is pinned. For anonymous pages that
        are mapped R/W, PG_anon_exclusive can be assumed to always be set
        because such pages cannot possibly be shared.
      
        The page table lock protecting the page table entry is the primary
        synchronization mechanism for PG_anon_exclusive; GUP-fast that does
        not take the PT lock needs special care when trying to clear the
        flag.
      
        Page table entry types and PG_anon_exclusive:
        * Present: PG_anon_exclusive applies.
        * Swap: the information is lost. PG_anon_exclusive was cleared.
        * Migration: the entry holds this information instead.
                     PG_anon_exclusive was cleared.
        * Device private: PG_anon_exclusive applies.
        * Device exclusive: PG_anon_exclusive applies.
        * HW Poison: PG_anon_exclusive is stale and not changed.
      
        If the page may be pinned (FOLL_PIN), clearing PG_anon_exclusive is
        not allowed and the flag will stick around until the page is freed
        and folio->mapping is cleared.
      "
      
We won't be clearing PG_anon_exclusive on destructive unmapping (i.e.,
zapping) of page table entries; page freeing code will handle that when
also invalidating page->mapping so the page no longer indicates
PageAnon().  Letting information about exclusivity stick around will be an
important property when adding sanity checks to unpinning code.
      
      Note that we properly clear the flag in free_pages_prepare() via
      PAGE_FLAGS_CHECK_AT_PREP for each individual subpage of a compound page,
      so there is no need to manually clear the flag.
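
As an illustration, a hedged sketch of what such a modification function
with sanity checks can look like (close in spirit to the series; the exact
macros in page-flags.h may differ):

  static __always_inline void SetPageAnonExclusive(struct page *page)
  {
          /* Must be anonymous and not KSM ... */
          VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
          /* ... and for hugetlb, only the head page holds the flag. */
          VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
          set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
  }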
      
Link: https://lkml.kernel.org/r/20220428083441.37290-12-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Don Dutile <ddutile@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jann Horn <jannh@google.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: John Hubbard <jhubbard@nvidia.com>
      Cc: Khalid Aziz <khalid.aziz@oracle.com>
      Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Liang Zhang <zhangliang5@huawei.com>
      Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.ibm.com>
      Cc: Nadav Amit <namit@vmware.com>
      Cc: Oded Gabbay <oded.gabbay@gmail.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Roman Gushchin <guro@fb.com>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      78fbe906
    • D
      mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap() · fb3d824d
Committed by David Hildenbrand
      ...  and move the special check for pinned pages into
      page_try_dup_anon_rmap() to prepare for tracking exclusive anonymous pages
      via a new pageflag, clearing it only after making sure that there are no
      GUP pins on the anonymous page.
      
We really only care about pins on anonymous pages, because they are prone
to getting replaced in the COW handler once mapped R/O.  For !anon pages
in COW mappings (!VM_SHARED && VM_MAYWRITE) we shouldn't really care about
that; at least I could not come up with an example where it would matter.
      
Let's drop the is_cow_mapping() check from page_needs_cow_for_dma(), as we
know we're dealing with anonymous pages.  Also, drop the handling of
pinned pages from copy_huge_pud() and add a comment in case anonymous
pages are ever supported on the PUD level.
      
      This is a preparation for tracking exclusivity of anonymous pages in the
      rmap code, and disallowing marking a page shared (-> failing to duplicate)
      if there are GUP pins on a page.
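
A hedged sketch of the resulting split (condensed; the in-tree helpers
carry longer comments and extra device-private handling):

  static inline void page_dup_file_rmap(struct page *page, bool compound)
  {
          atomic_inc(compound ? compound_mapcount_ptr(page) :
                                &page->_mapcount);
  }

  static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
                                           struct vm_area_struct *vma)
  {
          VM_BUG_ON_PAGE(!PageAnon(page), page);

          /*
           * The anonymous page may have been pinned in the parent: refuse
           * to duplicate the mapping and make the caller copy instead.
           */
          if (unlikely(page_needs_cow_for_dma(vma, page)))
                  return -EBUSY;

          /* Duplicating the mapping is now just like for a file page. */
          page_dup_file_rmap(page, compound);
          return 0;
  }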
      
Link: https://lkml.kernel.org/r/20220428083441.37290-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Don Dutile <ddutile@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jann Horn <jannh@google.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: John Hubbard <jhubbard@nvidia.com>
      Cc: Khalid Aziz <khalid.aziz@oracle.com>
      Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Liang Zhang <zhangliang5@huawei.com>
      Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.ibm.com>
      Cc: Nadav Amit <namit@vmware.com>
      Cc: Oded Gabbay <oded.gabbay@gmail.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Roman Gushchin <guro@fb.com>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      fb3d824d
    • D
      mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range() · 623a1ddf
Committed by David Hildenbrand
Let's do it just like copy_page_range(): take the seqlock and make sure
the mmap_lock is held in write mode.

This allows adding a VM_BUG_ON to page_needs_cow_for_dma() and properly
synchronizes concurrent fork() with GUP-fast for hugetlb pages, which will
be relevant for further changes.
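
A hedged sketch of the change, mirroring the structure of
copy_page_range() (surrounding copy logic omitted):

  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                              struct vm_area_struct *vma)
  {
          bool cow = is_cow_mapping(vma->vm_flags);

          if (cow) {
                  /* Guards against concurrent GUP-fast, like in fork(). */
                  mmap_assert_write_locked(src);
                  raw_write_seqcount_begin(&src->write_protect_seq);
          }

          /* ... copy the hugetlb page table entries ... */

          if (cow)
                  raw_write_seqcount_end(&src->write_protect_seq);
  }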
      
Link: https://lkml.kernel.org/r/20220428083441.37290-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Cc: Andrea Arcangeli <aarcange@redhat.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Don Dutile <ddutile@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jann Horn <jannh@google.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: John Hubbard <jhubbard@nvidia.com>
      Cc: Khalid Aziz <khalid.aziz@oracle.com>
      Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Liang Zhang <zhangliang5@huawei.com>
      Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Mike Rapoport <rppt@linux.ibm.com>
      Cc: Nadav Amit <namit@vmware.com>
      Cc: Oded Gabbay <oded.gabbay@gmail.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rik van Riel <riel@surriel.com>
      Cc: Roman Gushchin <guro@fb.com>
      Cc: Shakeel Butt <shakeelb@google.com>
      Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      623a1ddf
14. 30 April 2022, 2 commits