1. 04 Oct 2022, 3 commits
  2. 27 Sep 2022, 1 commit
  3. 12 Sep 2022, 15 commits
  4. 29 Aug 2022, 1 commit
  5. 21 Aug 2022, 1 commit
    • mm/hugetlb: support write-faults in shared mappings · 1d8d1464
      Committed by David Hildenbrand
      If we ever get a write-fault on a write-protected page in a shared
      mapping, we'd be in trouble (again).  Instead, we can simply map the page
      writable.
      
      And in fact, there is even a way right now to trigger that code via
      uffd-wp ever since we started to support it for shmem in 5.19:
      
      --------------------------------------------------------------------------
       #include <stdio.h>
       #include <stdlib.h>
       #include <string.h>
       #include <fcntl.h>
       #include <unistd.h>
       #include <errno.h>
       #include <sys/mman.h>
       #include <sys/syscall.h>
       #include <sys/ioctl.h>
       #include <linux/userfaultfd.h>
      
       #define HUGETLB_SIZE (2 * 1024 * 1024u)
      
       static char *map;
       int uffd;
      
       static int temp_setup_uffd(void)
       {
       	struct uffdio_api uffdio_api;
       	struct uffdio_register uffdio_register;
       	struct uffdio_writeprotect uffd_writeprotect;
       	struct uffdio_range uffd_range;
      
       	uffd = syscall(__NR_userfaultfd,
       		       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
       	if (uffd < 0) {
       		fprintf(stderr, "syscall() failed: %d\n", errno);
       		return -errno;
       	}
      
       	uffdio_api.api = UFFD_API;
       	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
       	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
       		fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
       		return -errno;
       	}
      
       	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
       		fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
       		return -ENOSYS;
       	}
      
       	/* Register UFFD-WP */
       	uffdio_register.range.start = (unsigned long) map;
       	uffdio_register.range.len = HUGETLB_SIZE;
       	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
       	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
       		fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
       		return -errno;
       	}
      
       	/* Writeprotect a single page. */
       	uffd_writeprotect.range.start = (unsigned long) map;
       	uffd_writeprotect.range.len = HUGETLB_SIZE;
       	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
       	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
       		fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
       		return -errno;
       	}
      
       	/* Unregister UFFD-WP without prior writeunprotection. */
       	uffd_range.start = (unsigned long) map;
       	uffd_range.len = HUGETLB_SIZE;
       	if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
       		fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", errno);
       		return -errno;
       	}
      
       	return 0;
       }
      
       int main(int argc, char **argv)
       {
       	int fd;
      
        	fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT, 0600);
        	if (fd < 0) {
        		fprintf(stderr, "open() failed\n");
        		return -errno;
        	}
       	if (ftruncate(fd, HUGETLB_SIZE)) {
       		fprintf(stderr, "ftruncate() failed\n");
       		return -errno;
       	}
      
       	map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
       	if (map == MAP_FAILED) {
       		fprintf(stderr, "mmap() failed\n");
       		return -errno;
       	}
      
       	*map = 0;
      
       	if (temp_setup_uffd())
       		return 1;
      
       	*map = 0;
      
       	return 0;
       }
      --------------------------------------------------------------------------
      
      The above test fails with SIGBUS when there is only a single free hugetlb page.
       # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
       # ./test
       Bus error (core dumped)
      
      And worse, with sufficient free hugetlb pages it will map an anonymous page
      into a shared mapping, for example, messing up accounting during unmap
      and breaking MAP_SHARED semantics:
       # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
       # ./test
       # cat /proc/meminfo | grep HugePages_
       HugePages_Total:       2
       HugePages_Free:        1
       HugePages_Rsvd:    18446744073709551615
       HugePages_Surp:        0
      
      The reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
      unregistering and consequently keeps the PTE writeprotected.  This is
      done to avoid the additional overhead when unregistering.  Note that
      this is also the case for !hugetlb and that we will end up with writable
      PTEs that still have the uffd-wp PTE bit set once we return from
      hugetlb_wp().  I'm not touching the uffd-wp PTE bit for now, because it
      seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
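
      For reference, here is a hedged sketch of what explicit write-unprotection
      before UFFDIO_UNREGISTER could look like in the reproducer above.  It
      reuses the reproducer's globals (uffd, map, HUGETLB_SIZE); mode 0 (no
      UFFDIO_WRITEPROTECT_MODE_WP flag) requests write-unprotection.  This is
      illustrative only and is not part of the kernel fix:

      ```
       static int temp_writeunprotect_before_unregister(void)
       {
       	struct uffdio_writeprotect wp = {
       		.range = {
       			.start = (unsigned long) map,
       			.len   = HUGETLB_SIZE,
       		},
       		/* No UFFDIO_WRITEPROTECT_MODE_WP: clear write protection. */
       		.mode = 0,
       	};

       	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)) {
       		fprintf(stderr, "write-unprotect failed: %d\n", errno);
       		return -errno;
       	}
       	return 0;
       }
      ```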
      
      VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE indicates
      that MAP_SHARED handling was at least envisioned, but could never have
      worked as expected.
      
      While at it, make sure that we never end up in hugetlb_wp() on write
      faults without VM_WRITE, because we don't support maybe_mkwrite()
      semantics as commonly used in the !hugetlb case -- for example, in
      wp_page_reuse().
      
      Note that there is no need to do any kind of reservation in
      hugetlb_fault() in this case ...  because we already have a hugetlb page
      mapped R/O that we will simply map writable and we are not dealing with
      COW/unsharing.
      
      Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
      Fixes: b1f9e876 ("mm/uffd: enable write protection for shmem & hugetlbfs")
      Signed-off-by: David Hildenbrand <david@redhat.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Bjorn Helgaas <bhelgaas@google.com>
      Cc: Cyrill Gorcunov <gorcunov@openvz.org>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jamie Liu <jamieliu@google.com>
      Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
      Cc: Pavel Emelyanov <xemul@parallels.com>
      Cc: Peter Feiner <pfeiner@google.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: <stable@vger.kernel.org>	[5.19]
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  6. 09 Aug 2022, 4 commits
    • mm, hwpoison, hugetlb: support saving mechanism of raw error pages · 161df60e
      Committed by Naoya Horiguchi
      When handling a memory error on a hugetlb page, the error handler tries
      to dissolve it and turn it into 4kB pages.  If it's successfully
      dissolved, the PageHWPoison flag is moved to the raw error page, so
      that's all right.  However, dissolving sometimes fails, and then the
      error page is left as a hwpoisoned hugepage.  It would be useful if we
      could retry the dissolve to save the healthy pages, but that's not
      possible now because the information about where the raw error pages
      are is lost.
      
      Use the private field of a few tail pages to keep that information.  The
      code path that shrinks the hugepage pool uses this info to retry the
      dissolve later.  In order to remember multiple errors in a hugepage, a
      singly-linked list anchored at the SUBPAGE_INDEX_HWPOISON-th tail page
      is constructed.  Only simple operations (adding an entry or clearing the
      whole list) are required and the list is assumed not to be very long, so
      this simple data structure should be enough.
      
      If we fail to save the raw error info, the hwpoison hugepage has errors
      on an unknown subpage and this new saving mechanism no longer works, so
      disable both saving new raw error info and freeing hwpoison hugepages.
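
      As a stand-alone illustration of the data structure just described (a
      hypothetical sketch: the struct, field names and helpers below are
      invented for illustration and are not the kernel implementation):

      ```
       #include <stdlib.h>
       #include <stdio.h>

       /* Illustrative stand-in for one recorded raw error page. */
       struct raw_hwp_page {
       	struct raw_hwp_page *next;
       	unsigned long pfn;	/* stand-in for the raw error page */
       };

       /* Add one entry at the head; "add" is the only insert operation needed. */
       static int raw_hwp_list_add(struct raw_hwp_page **head, unsigned long pfn)
       {
       	struct raw_hwp_page *p = malloc(sizeof(*p));

       	if (!p)
       		return -1;	/* caller would then give up on saving the info */
       	p->pfn = pfn;
       	p->next = *head;
       	*head = p;
       	return 0;
       }

       /* Clear the whole list, e.g. once the hugepage is finally dissolved. */
       static void raw_hwp_list_clear(struct raw_hwp_page **head)
       {
       	while (*head) {
       		struct raw_hwp_page *p = *head;

       		*head = p->next;
       		free(p);
       	}
       }

       int main(void)
       {
       	struct raw_hwp_page *head = NULL;

       	raw_hwp_list_add(&head, 0x1234);
       	raw_hwp_list_add(&head, 0x5678);
       	printf("most recently added pfn: 0x%lx\n", head->pfn);
       	raw_hwp_list_clear(&head);
       	return 0;
       }
      ```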
      
      Link: https://lkml.kernel.org/r/20220714042420.1847125-4-naoya.horiguchi@linux.dev
      Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
      Reported-by: kernel test robot <lkp@intel.com>
      Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Liu Shixin <liushixin2@huawei.com>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Oscar Salvador <osalvador@suse.de>
      Cc: Yang Shi <shy828301@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    • mm/hugetlb: make pud_huge() and follow_huge_pud() aware of non-present pud entry · 3a194f3f
      Committed by Naoya Horiguchi
      follow_pud_mask() does not support non-present pud entries now.  As far
      as I tested on an x86_64 server, follow_pud_mask() still simply returns
      no_page_table() for a non-present pud entry due to pud_bad(), so no
      severe user-visible effect should happen.  But generally we should call
      follow_huge_pud() for a non-present pud entry of a 1GB hugetlb page.
      
      Update pud_huge() and follow_huge_pud() to handle non-present pud
      entries.  The changes are similar to the previous work for pmd entries
      in commit e66f17ff ("mm/hugetlb: take page table lock in
      follow_huge_pmd()") and commit cbef8478 ("mm/hugetlb: pmd_huge()
      returns true for non-present hugepage").
      
      Link: https://lkml.kernel.org/r/20220714042420.1847125-3-naoya.horiguchi@linux.dev
      Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
      Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: kernel test robot <lkp@intel.com>
      Cc: Liu Shixin <liushixin2@huawei.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: Oscar Salvador <osalvador@suse.de>
      Cc: Yang Shi <shy828301@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    • mm/hugetlb: check gigantic_page_runtime_supported() in return_unused_surplus_pages() · c0531714
      Committed by Naoya Horiguchi
      Patch series "mm, hwpoison: enable 1GB hugepage support", v7.
      
      
      This patch (of 8):
      
      I found a weird state of the 1GB hugepage pool, caused by the following
      procedure:

        - run a process reserving all free 1GB hugepages,
        - shrink the free 1GB hugepage pool to zero (i.e. write 0 to
          /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages), then
        - kill the reserving process.

      After that, all the hugepages are free *and* surplus at the same time.
      
        $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
        3
        $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages
        3
        $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/resv_hugepages
        0
        $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/surplus_hugepages
        3
      
      This state is resolved by reserving and allocating the pages and then
      freeing them again, so it does not seem to cause a serious problem.  But
      it's a little surprising (shrinking the pool suddenly fails).
      
      This behavior is caused by the hstate_is_gigantic() check in
      return_unused_surplus_pages().  That check was introduced long ago, in
      2008, by commit aa888a74 ("hugetlb: support larger than MAX_ORDER"),
      when gigantic pages were not supposed to be allocated or freed at
      run-time.  The kernel can now allocate and free them at runtime, so
      let's also check gigantic_page_runtime_supported().
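
      Conceptually, the guard change can be pictured with this stand-alone
      mock-up (the *_stub helpers are invented stand-ins for the kernel
      functions named above; this is not the actual kernel code):

      ```
       #include <stdbool.h>
       #include <stdio.h>

       static bool hstate_is_gigantic_stub(void)
       {
       	return true;	/* pretend we are looking at the 1GB hstate */
       }

       static bool gigantic_page_runtime_supported_stub(void)
       {
       	return true;	/* pretend the arch can free gigantic pages at runtime */
       }

       /* Old guard: bail out for gigantic pages unconditionally.
        * New guard (as described): bail out only when runtime freeing of
        * gigantic pages is not supported. */
       static bool skip_unused_surplus_handling(void)
       {
       	return hstate_is_gigantic_stub() &&
       	       !gigantic_page_runtime_supported_stub();
       }

       int main(void)
       {
       	printf("skip handling unused surplus pages: %s\n",
       	       skip_unused_surplus_handling() ? "yes" : "no");
       	return 0;
       }
      ```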
      
      Link: https://lkml.kernel.org/r/20220714042420.1847125-1-naoya.horiguchi@linux.dev
      Link: https://lkml.kernel.org/r/20220714042420.1847125-2-naoya.horiguchi@linux.dev
      Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
      Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Liu Shixin <liushixin2@huawei.com>
      Cc: Yang Shi <shy828301@gmail.com>
      Cc: Oscar Salvador <osalvador@suse.de>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Cc: kernel test robot <lkp@intel.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    • mm: hugetlb_vmemmap: improve hugetlb_vmemmap code readability · 6213834c
      Committed by Muchun Song
      There is a discussion about the naming of hugetlb_vmemmap_alloc/free in
      thread [1].  David suggested renaming "alloc/free" to "optimize/restore"
      to make the functionality clearer to users: "optimize" means the
      function will optimize the vmemmap pages, while "restore" means
      restoring the vmemmap pages discarded before.  This commit does that.
      
      Another point of discussion is the confusion that RESERVE_VMEMMAP_NR
      isn't used explicitly for vmemmap_addr but only implicitly for
      vmemmap_end in hugetlb_vmemmap_alloc/free.  David suggested that we can
      compute at runtime what hugetlb_vmemmap_init() does now.  We do not need
      to worry about the overhead of computing at runtime since the
      calculation is simple enough and those functions are not in a hot path.
      This commit makes the following improvements:
      
        1) The function suffixes ("optimize/restore") are more expressive.
        2) The logic becomes less weird in hugetlb_vmemmap_optimize/restore().
        3) The hugetlb_vmemmap_init() does not need to be exported anymore.
        4) The ->optimize_vmemmap_pages field in struct hstate is killed.
        5) There is only one place that checks is_power_of_2(sizeof(struct
           page)) instead of two places.
        6) Add more comments for hugetlb_vmemmap_optimize/restore().
        7) External users originally used hugetlb_optimize_vmemmap_pages() to
           detect whether a HugeTLB page's vmemmap pages are optimizable.  In
           this commit it is killed, and a new helper,
           hugetlb_vmemmap_optimizable(), is introduced to replace it.  The
           name is more expressive.
      
      Link: https://lore.kernel.org/all/20220404074652.68024-2-songmuchun@bytedance.com/ [1]
      Link: https://lkml.kernel.org/r/20220628092235.91270-7-songmuchun@bytedance.com
      Signed-off-by: Muchun Song <songmuchun@bytedance.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Anshuman Khandual <anshuman.khandual@arm.com>
      Cc: Catalin Marinas <catalin.marinas@arm.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: Jonathan Corbet <corbet@lwn.net>
      Cc: Oscar Salvador <osalvador@suse.de>
      Cc: Will Deacon <will@kernel.org>
      Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  7. 19 Jul 2022, 2 commits
  8. 18 Jul 2022, 4 commits
    • mm, hugetlb: skip irrelevant nodes in show_free_areas() · dcadcf1c
      Committed by Gang Li
      show_free_areas() allows filtering out node-specific data that is
      irrelevant to the allocation request.  But hugetlb_show_meminfo() still
      shows hugetlb info on all nodes, which is redundant and unnecessary.

      Use show_mem_node_skip() to skip irrelevant nodes, and replace
      hugetlb_show_meminfo() with hugetlb_show_meminfo_node(nid).
      
      Before-and-after sample output of an OOM report:
      
      before:
      ```
      [  214.362453] Node 1 active_anon:148kB inactive_anon:4050920kB active_file:112kB inactive_file:100kB
      [  214.375429] Node 1 Normal free:45100kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig
      [  214.388334] lowmem_reserve[]: 0 0 0 0 0
      [  214.390251] Node 1 Normal: 423*4kB (UE) 320*8kB (UME) 187*16kB (UE) 117*32kB (UE) 57*64kB (UME) 20
      [  214.397626] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
      [  214.401518] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
      ```
      
      after:
      ```
      [  145.069705] Node 1 active_anon:128kB inactive_anon:4049412kB active_file:56kB inactive_file:84kB u
      [  145.110319] Node 1 Normal free:45424kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig
      [  145.152315] lowmem_reserve[]: 0 0 0 0 0
      [  145.155244] Node 1 Normal: 470*4kB (UME) 373*8kB (UME) 247*16kB (UME) 168*32kB (UE) 86*64kB (UME)
      [  145.164119] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
      ```
      
      Link: https://lkml.kernel.org/r/20220706034655.1834-1-ligang.bdlg@bytedance.com
      Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
      Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Muchun Song <songmuchun@bytedance.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    • hugetlb: do not update address in huge_pmd_unshare · 4ddb4d91
      Committed by Mike Kravetz
      As an optimization for loops sequentially processing hugetlb address
      ranges, huge_pmd_unshare would update a passed address if it unshared a
      pmd.  Updating a loop control variable outside the loop like this is
      generally a bad idea.  These loops are now using hugetlb_mask_last_page to
      optimize scanning when non-present ptes are discovered.  The same can be
      done when huge_pmd_unshare returns 1 indicating a pmd was unshared.
      
      Remove the address update from huge_pmd_unshare.  Change the passed
      argument type and update all callers.  In loops that sequentially
      process addresses, use hugetlb_mask_last_page to update the address when
      a pmd was unshared, as in the sketch below.
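
      A hedged, self-contained mock-up of that caller-side pattern (the helper
      names, the 2 MiB huge page size and the 1 GiB range per page-table page
      are assumptions for illustration, not the kernel code):

      ```
       #include <stdbool.h>
       #include <stdio.h>

       #define HUGEPAGE_SIZE	(2UL << 20)	/* 2 MiB huge pages (assumption) */
       #define PMD_PAGE_RANGE	(1UL << 30)	/* range covered by one pmd page */

       /* Stand-in for hugetlb_mask_last_page(): OR-ing this mask with an
        * address yields the address of the last huge page mapped by the same
        * page-table page. */
       static unsigned long mask_last_page(void)
       {
       	return PMD_PAGE_RANGE - HUGEPAGE_SIZE;
       }

       /* Stand-in for huge_pmd_unshare(): pretend the first pmd was shared. */
       static bool unshare_pmd(unsigned long addr)
       {
       	return addr == 0;
       }

       int main(void)
       {
       	unsigned long addr, end = 2 * PMD_PAGE_RANGE;
       	unsigned long visited = 0;

       	for (addr = 0; addr < end; addr += HUGEPAGE_SIZE) {
       		visited++;
       		if (unshare_pmd(addr)) {
       			/* The whole pmd page went away: skip to its last
       			 * huge page instead of stepping page by page. */
       			addr |= mask_last_page();
       			continue;
       		}
       		/* ... per-page processing would go here ... */
       	}
       	printf("loop iterations: %lu\n", visited);
       	return 0;
       }
      ```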
      
      [sfr@canb.auug.org.au: fix an unused variable warning/error]
        Link: https://lkml.kernel.org/r/20220622171117.70850960@canb.auug.org.au
      Link: https://lkml.kernel.org/r/20220621235620.291305-4-mike.kravetz@oracle.com
      Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
      Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
      Acked-by: Muchun Song <songmuchun@bytedance.com>
      Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
      Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
      Cc: Anshuman Khandual <anshuman.khandual@arm.com>
      Cc: Catalin Marinas <catalin.marinas@arm.com>
      Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
      Cc: David Hildenbrand <david@redhat.com>
      Cc: James Houghton <jthoughton@google.com>
      Cc: kernel test robot <lkp@intel.com>
      Cc: Michal Hocko <mhocko@suse.com>
      Cc: Mina Almasry <almasrymina@google.com>
      Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
      Cc: Paul Walmsley <paul.walmsley@sifive.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
      Cc: Will Deacon <will@kernel.org>
      Cc: Stephen Rothwell <sfr@canb.auug.org.au>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    • hugetlb: skip to end of PT page mapping when pte not present · e95a9851
      Committed by Mike Kravetz
      Patch series "hugetlb: speed up linear address scanning", v2.
      
      At unmap, fork and remap time hugetlb address ranges are linearly scanned.
      We can optimize these scans if the ranges are sparsely populated.
      
      Also, enable page table "Lazy copy" for hugetlb at fork.
      
      NOTE: Architectures that do not define CONFIG_ARCH_WANT_GENERAL_HUGETLB
      need to add an arch-specific version of hugetlb_mask_last_page() to take
      advantage of the sparse address scanning improvements.  Baolin Wang
      added the routine for arm64.  Other architectures which could be
      optimized are: ia64, mips, parisc, powerpc, s390, sh and sparc.
      
      
      This patch (of 4):
      
      HugeTLB address ranges are linearly scanned during fork, unmap and remap
      operations.  If a non-present entry is encountered, the code currently
      continues to the next huge-page-aligned address.  However, a non-present
      entry implies that the page table page for that entry is not present.
      Therefore, the linear scan can skip to the end of the range mapped by
      that page table page.  This can speed up operations on large, sparsely
      populated hugetlb mappings.
      
      Create a new routine hugetlb_mask_last_page() that will return an address
      mask.  When the mask is ORed with an address, the result will be the
      address of the last huge page mapped by the associated page table page. 
      Use this mask to update addresses in routines which linearly scan hugetlb
      address ranges when a non-present pte is encountered.
      
      hugetlb_mask_last_page is related to the implementation of huge_pte_offset
      as hugetlb_mask_last_page is called when huge_pte_offset returns NULL. 
      This patch only provides a complete hugetlb_mask_last_page implementation
      when CONFIG_ARCH_WANT_GENERAL_HUGETLB is defined.  Architectures which
      provide their own versions of huge_pte_offset can also provide their own
      version of hugetlb_mask_last_page.
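
      For intuition, a small stand-alone example of the OR-with-mask
      arithmetic described above, assuming 2 MiB huge pages whose entries live
      in a page-table page covering 1 GiB (an x86_64-style layout); this is
      not the kernel implementation:

      ```
       #include <stdio.h>

       #define HUGEPAGE_SIZE	(2UL << 20)	/* 2 MiB huge pages (assumption) */
       #define PMD_PAGE_RANGE	(1UL << 30)	/* one pmd page maps 1 GiB of them */

       int main(void)
       {
       	/* Mask such that addr | mask is the address of the last huge page
       	 * mapped by the page-table page containing addr's entry. */
       	unsigned long mask = PMD_PAGE_RANGE - HUGEPAGE_SIZE;
       	unsigned long addr = 0x40600000UL;	/* some address in the 2nd GiB */
       	unsigned long last = addr | mask;

       	printf("addr           = %#lx\n", addr);
       	printf("mask           = %#lx\n", mask);
       	printf("last huge page = %#lx\n", last);
       	/* The scan can then continue at last + HUGEPAGE_SIZE. */
       	printf("next range     = %#lx\n", last + HUGEPAGE_SIZE);
       	return 0;
       }
      ```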
      
      Link: https://lkml.kernel.org/r/20220621235620.291305-1-mike.kravetz@oracle.com
      Link: https://lkml.kernel.org/r/20220621235620.291305-2-mike.kravetz@oracle.com
      Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
      Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
      Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
      Acked-by: Muchun Song <songmuchun@bytedance.com>
      Reported-by: kernel test robot <lkp@intel.com>
      Cc: Michal Hocko <mhocko@suse.com>
      Cc: Peter Xu <peterx@redhat.com>
      Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
      Cc: James Houghton <jthoughton@google.com>
      Cc: Mina Almasry <almasrymina@google.com>
      Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
      Cc: Anshuman Khandual <anshuman.khandual@arm.com>
      Cc: Paul Walmsley <paul.walmsley@sifive.com>
      Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
      Cc: Catalin Marinas <catalin.marinas@arm.com>
      Cc: Will Deacon <will@kernel.org>
      Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
      Cc: David Hildenbrand <david@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    • mm: rename is_pinnable_page() to is_longterm_pinnable_page() · 6077c943
      Committed by Alex Sierra
      Patch series "Add MEMORY_DEVICE_COHERENT for coherent device memory
      mapping", v9.
      
      This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
      owned by a device that can be mapped into CPU page tables like
      MEMORY_DEVICE_GENERIC and can also be migrated like MEMORY_DEVICE_PRIVATE.
      
      This patch series is mostly self-contained except for a few places where
      it needs to update other subsystems to handle the new memory type.
      
      System stability and performance are not affected according to our ongoing
      testing, including xfstests.
      
      How it works: The system BIOS advertises the GPU device memory (aka VRAM)
      as SPM (special purpose memory) in the UEFI system address map.
      
      The amdgpu driver registers the memory with devmap as
      MEMORY_DEVICE_COHERENT using devm_memremap_pages.  The initial user for
      this hardware page migration capability is the Frontier supercomputer
      project.  This functionality is not AMD-specific.  We expect other GPU
      vendors to find this functionality useful, and possibly other hardware
      types in the future.
      
      Our test nodes in the lab are similar to the Frontier configuration, with
      0.5 TB of system memory plus 256 GB of device memory split across 4 GPUs,
      all in a single coherent address space.  Page migration is expected to
      improve application efficiency significantly.  We will report empirical
      results as they become available.
      
      Coherent device type pages at gup are now migrated back to system memory
      if they are being pinned long-term (FOLL_LONGTERM).  The reason is that
      long-term pinning would interfere with the device memory manager owning
      the device-coherent pages (e.g. evictions in TTM).  This series
      incorporates Alistair Popple's patches to do this migration from
      pin_user_pages() calls.  hmm_gup_test has been added to hmm-test to
      exercise the different get_user_pages calls.
      
      This series includes handling of device-managed anonymous pages returned
      by vm_normal_pages.  Although they behave like normal pages for purposes
      of mapping in CPU page tables and for COW, they do not support LRU lists,
      NUMA migration or THP.
      
      We also introduced a FOLL_LRU flag that adds the same behaviour to
      follow_page and related APIs, to allow callers to specify that they expect
      to put pages on an LRU list.
      
      
      This patch (of 14):
      
      is_pinnable_page() and folio_is_pinnable() are renamed to
      is_longterm_pinnable_page() and folio_is_longterm_pinnable() respectively.
      These functions are used in the FOLL_LONGTERM flag context.
      
      Link: https://lkml.kernel.org/r/20220715150521.18165-1-alex.sierra@amd.com
      Link: https://lkml.kernel.org/r/20220715150521.18165-2-alex.sierra@amd.com
      Signed-off-by: Alex Sierra <alex.sierra@amd.com>
      Reviewed-by: David Hildenbrand <david@redhat.com>
      Cc: Jason Gunthorpe <jgg@nvidia.com>
      Cc: Felix Kuehling <Felix.Kuehling@amd.com>
      Cc: Ralph Campbell <rcampbell@nvidia.com>
      Cc: Christoph Hellwig <hch@lst.de>
      Cc: Jerome Glisse <jglisse@redhat.com>
      Cc: Alistair Popple <apopple@nvidia.com>
      Cc: Matthew Wilcox <willy@infradead.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  9. 04 Jul 2022, 5 commits
  10. 29 Jun 2022, 1 commit
  11. 28 Jun 2022, 1 commit
  12. 02 Jun 2022, 1 commit
  13. 27 May 2022, 1 commit