1. 30 Jul 2018, 1 commit
  2. 13 Jun 2018, 1 commit
    • treewide: Use array_size() in vmalloc() · 42bc47b3
      Committed by Kees Cook
      The vmalloc() function has no 2-factor argument form, so multiplication
      factors need to be wrapped in array_size(). This patch replaces cases of:
      
              vmalloc(a * b)
      
      with:
              vmalloc(array_size(a, b))
      
      as well as handling cases of:
      
              vmalloc(a * b * c)
      
      with:
      
              vmalloc(array3_size(a, b, c))
      
      This does, however, attempt to ignore constant size factors like:
      
              vmalloc(4 * 1024)
      
      though any constants defined via macros get caught up in the conversion.
      
      Any factors with a sizeof() of "unsigned char", "char", and "u8" were
      dropped, since they're redundant.
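
      As an aside (illustrative, not part of the patch itself): array_size()
      from <linux/overflow.h> saturates to SIZE_MAX on overflow, so the
      allocation fails cleanly instead of returning an undersized buffer:

              #include <linux/overflow.h>
              #include <linux/types.h>
              #include <linux/vmalloc.h>

              struct entry { u64 key; u64 val; };

              static struct entry *alloc_table(size_t nmemb)
              {
                      /* before: vmalloc(nmemb * sizeof(struct entry)),
                       * which can overflow and allocate too little */
                      return vmalloc(array_size(nmemb, sizeof(struct entry)));
              }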
      
      The Coccinelle script used for this was:
      
      // Fix redundant parens around sizeof().
      @@
      type TYPE;
      expression THING, E;
      @@
      
      (
        vmalloc(
      -	(sizeof(TYPE)) * E
      +	sizeof(TYPE) * E
        , ...)
      |
        vmalloc(
      -	(sizeof(THING)) * E
      +	sizeof(THING) * E
        , ...)
      )
      
      // Drop single-byte sizes and redundant parens.
      @@
      expression COUNT;
      typedef u8;
      typedef __u8;
      @@
      
      (
        vmalloc(
      -	sizeof(u8) * (COUNT)
      +	COUNT
        , ...)
      |
        vmalloc(
      -	sizeof(__u8) * (COUNT)
      +	COUNT
        , ...)
      |
        vmalloc(
      -	sizeof(char) * (COUNT)
      +	COUNT
        , ...)
      |
        vmalloc(
      -	sizeof(unsigned char) * (COUNT)
      +	COUNT
        , ...)
      |
        vmalloc(
      -	sizeof(u8) * COUNT
      +	COUNT
        , ...)
      |
        vmalloc(
      -	sizeof(__u8) * COUNT
      +	COUNT
        , ...)
      |
        vmalloc(
      -	sizeof(char) * COUNT
      +	COUNT
        , ...)
      |
        vmalloc(
      -	sizeof(unsigned char) * COUNT
      +	COUNT
        , ...)
      )
      
      // 2-factor product with sizeof(type/expression) and identifier or constant.
      @@
      type TYPE;
      expression THING;
      identifier COUNT_ID;
      constant COUNT_CONST;
      @@
      
      (
        vmalloc(
      -	sizeof(TYPE) * (COUNT_ID)
      +	array_size(COUNT_ID, sizeof(TYPE))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE) * COUNT_ID
      +	array_size(COUNT_ID, sizeof(TYPE))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE) * (COUNT_CONST)
      +	array_size(COUNT_CONST, sizeof(TYPE))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE) * COUNT_CONST
      +	array_size(COUNT_CONST, sizeof(TYPE))
        , ...)
      |
        vmalloc(
      -	sizeof(THING) * (COUNT_ID)
      +	array_size(COUNT_ID, sizeof(THING))
        , ...)
      |
        vmalloc(
      -	sizeof(THING) * COUNT_ID
      +	array_size(COUNT_ID, sizeof(THING))
        , ...)
      |
        vmalloc(
      -	sizeof(THING) * (COUNT_CONST)
      +	array_size(COUNT_CONST, sizeof(THING))
        , ...)
      |
        vmalloc(
      -	sizeof(THING) * COUNT_CONST
      +	array_size(COUNT_CONST, sizeof(THING))
        , ...)
      )
      
      // 2-factor product, only identifiers.
      @@
      identifier SIZE, COUNT;
      @@
      
        vmalloc(
      -	SIZE * COUNT
      +	array_size(COUNT, SIZE)
        , ...)
      
      // 3-factor product with 1 sizeof(type) or sizeof(expression), with
      // redundant parens removed.
      @@
      expression THING;
      identifier STRIDE, COUNT;
      type TYPE;
      @@
      
      (
        vmalloc(
      -	sizeof(TYPE) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        vmalloc(
      -	sizeof(THING) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        vmalloc(
      -	sizeof(THING) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        vmalloc(
      -	sizeof(THING) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        vmalloc(
      -	sizeof(THING) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      )
      
      // 3-factor product with 2 sizeof(variable), with redundant parens removed.
      @@
      expression THING1, THING2;
      identifier COUNT;
      type TYPE1, TYPE2;
      @@
      
      (
        vmalloc(
      -	sizeof(TYPE1) * sizeof(TYPE2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        vmalloc(
      -	sizeof(THING1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        vmalloc(
      -	sizeof(THING1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      |
        vmalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      )
      
      // 3-factor product, only identifiers, with redundant parens removed.
      @@
      identifier STRIDE, SIZE, COUNT;
      @@
      
      (
        vmalloc(
      -	(COUNT) * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        vmalloc(
      -	COUNT * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        vmalloc(
      -	COUNT * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        vmalloc(
      -	(COUNT) * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        vmalloc(
      -	COUNT * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        vmalloc(
      -	(COUNT) * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        vmalloc(
      -	(COUNT) * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        vmalloc(
      -	COUNT * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      )
      
      // Any remaining multi-factor products, first at least 3-factor products
      // when they're not all constants...
      @@
      expression E1, E2, E3;
      constant C1, C2, C3;
      @@
      
      (
        vmalloc(C1 * C2 * C3, ...)
      |
        vmalloc(
      -	E1 * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      )
      
      // And then all remaining 2 factors products when they're not all constants.
      @@
      expression E1, E2;
      constant C1, C2;
      @@
      
      (
        vmalloc(C1 * C2, ...)
      |
        vmalloc(
      -	E1 * E2
      +	array_size(E1, E2)
        , ...)
      )
      Signed-off-by: Kees Cook <keescook@chromium.org>
  3. 18 May 2018, 1 commit
    • KVM: PPC: Book3S HV: Lockless tlbie for HPT hcalls · b7557451
      Committed by Nicholas Piggin
      tlbies to an LPAR do not have to be serialised since POWER4/PPC970,
      after which the MMU_FTR_LOCKLESS_TLBIE feature was introduced to
      avoid tlbie locking.
      
      Since commit c17b98cf ("KVM: PPC: Book3S HV: Remove code for
      PPC970 processors"), KVM no longer supports processors that do not
      have this feature, so the tlbie locking can be removed completely.
      A sanity check for the feature is put in kvmppc_mmu_hv_init.
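
      A minimal sketch of what such a sanity check could look like (the exact
      placement and error code are illustrative):

              /* HPT KVM requires lockless tlbie (POWER4/PPC970 and later) */
              if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
                      return -EIO;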
      
      Testing was done on a POWER9 system in HPT mode, with a -smp 32 guest
      in HPT mode. 32 instances of the powerpc fork benchmark from selftests
      were run with --fork, and the results measured.
      
      Without this patch, total throughput was about 13.5K/sec, and this is
      the top of the host profile:
      
         74.52%  [k] do_tlbies
          2.95%  [k] kvmppc_book3s_hv_page_fault
          1.80%  [k] calc_checksum
          1.80%  [k] kvmppc_vcpu_run_hv
          1.49%  [k] kvmppc_run_core
      
      After this patch, throughput was about 51K/sec, with this profile:
      
         21.28%  [k] do_tlbies
          5.26%  [k] kvmppc_run_core
          4.88%  [k] kvmppc_book3s_hv_page_fault
          3.30%  [k] _raw_spin_lock_irqsave
          3.25%  [k] gup_pgd_range
      Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  4. 19 Mar 2018, 1 commit
  5. 09 Feb 2018, 2 commits
    • KVM: PPC: Book3S HV: Make HPT resizing work on POWER9 · 790a9df5
      Committed by David Gibson
      This adds code to enable the HPT resizing code to work on POWER9,
      which uses a slightly modified HPT entry format compared to POWER8.
      On POWER9, we convert HPTEs read from the HPT from the new format to
      the old format so that the rest of the HPT resizing code can work as
      before.  HPTEs written to the new HPT are converted to the new format
      as the last step before writing them into the new HPT.
      
      This takes out the checks added by commit bcd3bb63 ("KVM: PPC:
      Book3S HV: Disable HPT resizing on POWER9 for now", 2017-02-18),
      now that HPT resizing works on POWER9.
      
      On POWER9, when we pivot to the new HPT, we now call
      kvmppc_setup_partition_table() to update the partition table in order
      to make the hardware use the new HPT.
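
      A hedged sketch of the read-side conversion described above, using the
      existing hpte_new_to_old_v()/hpte_new_to_old_r() helpers (the wrapper
      function itself is hypothetical):

              static void read_hpte_old_format(__be64 *hptep,
                                               unsigned long *v, unsigned long *r)
              {
                      *v = be64_to_cpu(hptep[0]);
                      *r = be64_to_cpu(hptep[1]);
                      if (cpu_has_feature(CPU_FTR_ARCH_300)) {
                              /* POWER9: move the B field back into dword 0 */
                              *v = hpte_new_to_old_v(*v, *r);
                              *r = hpte_new_to_old_r(*r);
                      }
              }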
      
      [paulus@ozlabs.org - added kvmppc_setup_partition_table() call,
       wrote commit message.]
      Tested-by: Laurent Vivier <lvivier@redhat.com>
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Fix handling of secondary HPTEG in HPT resizing code · 05f2bb03
      Committed by Paul Mackerras
      This fixes the computation of the HPTE index to use when the HPT
      resizing code encounters a bolted HPTE which is stored in its
      secondary HPTE group.  The code inverts the HPTE group number, which
      is correct, but doesn't then mask it with new_hash_mask.  As a result,
      new_pteg will be effectively negative, resulting in new_hptep
      pointing before the new HPT, which will corrupt memory.
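
      In pseudocode, the fix amounts to the following (illustrative, not the
      literal diff):

              if (vpte & HPTE_V_SECONDARY)
                      /* invert the group number, then keep it in range */
                      new_pteg = ~new_pteg & new_hash_mask;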
      
      In addition, this removes two BUG_ON statements.  The condition that
      the BUG_ONs were testing -- that we have computed the hash value
      incorrectly -- has never been observed in testing, and if it did
      occur, would only affect the guest, not the host.  Given that
      BUG_ON should only be used in conditions where the kernel (i.e.
      the host kernel, in this case) can't possibly continue execution,
      it is not appropriate here.
      Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  6. 10 Jan 2018, 1 commit
    • KVM: PPC: Book3S HV: Always flush TLB in kvmppc_alloc_reset_hpt() · ecba8297
      Committed by David Gibson
      The KVM_PPC_ALLOCATE_HTAB ioctl(), implemented by kvmppc_alloc_reset_hpt()
      is supposed to completely clear and reset a guest's Hashed Page Table (HPT)
      allocating or re-allocating it if necessary.
      
      In the case where an HPT of the right size already exists and it just
      zeroes it, it forces a TLB flush on all guest CPUs, to remove any stale TLB
      entries loaded from the old HPT.
      
      However, that situation can also arise when the HPT is being resized - or
      even when switching from a radix page table (RPT) to an HPT - so those
      cases need a TLB flush as well.
      
      So, move the TLB flush to trigger in all cases except for errors.
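
      Schematically (illustrative only), the end of kvmppc_alloc_reset_hpt()
      now looks like:

              /* Ensure that each vcpu will flush its TLB on next entry */
              if (!err)
                      cpumask_setall(&kvm->arch.need_tlb_flush);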
      
      Cc: stable@vger.kernel.org # v4.10+
      Fixes: f98a8bf9 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size")
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  7. 06 Dec 2017, 2 commits
    • KVM: PPC: Book3S HV: Fix use after free in case of multiple resize requests · 4ed11aee
      Committed by Serhii Popovych
      When serving multiple resize requests following could happen:
      
          CPU0                                    CPU1
          ----                                    ----
          kvm_vm_ioctl_resize_hpt_prepare(1);
            -> schedule_work()
                                                  /* system_rq might be busy: delay */
          kvm_vm_ioctl_resize_hpt_prepare(2);
            mutex_lock();
            if (resize) {
               ...
               release_hpt_resize();
            }
            ...                                   resize_hpt_prepare_work()
            -> schedule_work()                    {
            mutex_unlock()                           /* resize->kvm could be wrong */
                                                     struct kvm *kvm = resize->kvm;
      
                                                     mutex_lock(&kvm->lock);   <<<< UAF
                                                     ...
                                                  }
      
      i.e. a second resize request with a different order could be started by
      kvm_vm_ioctl_resize_hpt_prepare(), causing the previous request to be
      free()d while there is still an active worker thread that will try to
      access it.  This leads to a use after free at the point marked UAF in
      the diagram above.
      
      To prevent this from happening, instead of unconditionally releasing a
      pre-existing resize structure from the prepare ioctl(), we check if
      the existing structure has an in-progress worker.  We do that by
      checking if the resize->error == -EBUSY, which is safe because the
      resize->error field is protected by the kvm->lock.  If there is an
      active worker, instead of releasing, we mark the structure as stale by
      unlinking it from kvm_struct.
      
      In the worker thread we check for a stale structure (with kvm->lock
      held), and in that case abort, releasing the stale structure ourself.
      We make the check both before and after the actual allocation.  Strictly,
      only the check afterwards is needed; the check before is an
      optimization: if the structure happens to become stale before the
      worker thread is dispatched, rather than during the allocation, it
      means we can avoid allocating then immediately freeing a potentially
      substantial amount of memory.
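
      A simplified sketch of the prepare-side logic described above (variable
      names and structure are illustrative):

              if (resize) {
                      if (resize->error != -EBUSY) {
                              /* worker has finished: safe to free now */
                              resize_hpt_release(kvm, resize);
                      } else {
                              /* worker still running: unlink the request so
                               * the worker sees it is stale and frees it */
                              kvm->arch.resize_hpt = NULL;
                      }
              }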
      
      This fixes the following (or similar) host kernel crash:
      
      [  635.277361] Unable to handle kernel paging request for data at address 0x00000000
      [  635.277438] Faulting instruction address: 0xc00000000052f568
      [  635.277446] Oops: Kernel access of bad area, sig: 11 [#1]
      [  635.277451] SMP NR_CPUS=2048 NUMA PowerNV
      [  635.277470] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE
      nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4
      nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc
      ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter nfsv3 nfs_acl nfs
      lockd grace fscache kvm_hv kvm rpcrdma sunrpc ib_isert iscsi_target_mod ib_iser libiscsi
      scsi_transport_iscsi ib_srpt target_core_mod ext4 ib_srp scsi_transport_srp
      ib_ipoib mbcache jbd2 rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ocrdma(T)
      ib_core ses enclosure scsi_transport_sas sg shpchp leds_powernv ibmpowernv i2c_opal
      i2c_core powernv_rng ipmi_powernv ipmi_devintf ipmi_msghandler ip_tables xfs
      libcrc32c sr_mod sd_mod cdrom lpfc nvme_fc(T) nvme_fabrics nvme_core ipr nvmet_fc(T)
      tg3 nvmet libata be2net crc_t10dif crct10dif_generic scsi_transport_fc ptp scsi_tgt
      pps_core crct10dif_common dm_mirror dm_region_hash dm_log dm_mod
      [  635.278687] CPU: 40 PID: 749 Comm: kworker/40:1 Tainted: G
      ------------ T 3.10.0.bz1510771+ #1
      [  635.278782] Workqueue: events resize_hpt_prepare_work [kvm_hv]
      [  635.278851] task: c0000007e6840000 ti: c0000007e9180000 task.ti: c0000007e9180000
      [  635.278919] NIP: c00000000052f568 LR: c0000000009ea310 CTR: c0000000009ea4f0
      [  635.278988] REGS: c0000007e91837f0 TRAP: 0300   Tainted: G
      ------------ T  (3.10.0.bz1510771+)
      [  635.279077] MSR: 9000000100009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 24002022  XER:
      00000000
      [  635.279248] CFAR: c000000000009368 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1
      GPR00: c0000000009ea310 c0000007e9183a70 c000000001250b00 c0000007e9183b10
      GPR04: 0000000000000000 0000000000000000 c0000007e9183650 0000000000000000
      GPR08: c0000007ffff7b80 00000000ffffffff 0000000080000028 d00000000d2529a0
      GPR12: 0000000000002200 c000000007b56800 c000000000120028 c0000007f135bb40
      GPR16: 0000000000000000 c000000005c1e018 c000000005c1e018 0000000000000000
      GPR20: 0000000000000001 c0000000011bf778 0000000000000001 fffffffffffffef7
      GPR24: 0000000000000000 c000000f1e262e50 0000000000000002 c0000007e9180000
      GPR28: c000000f1e262e4c c000000f1e262e50 0000000000000000 c0000007e9183b10
      [  635.280149] NIP [c00000000052f568] __list_add+0x38/0x110
      [  635.280197] LR [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0
      [  635.280253] Call Trace:
      [  635.280277] [c0000007e9183af0] [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0
      [  635.280356] [c0000007e9183b70] [c0000000009ea554] mutex_lock+0x64/0x70
      [  635.280426] [c0000007e9183ba0] [d00000000d24da04]
      resize_hpt_prepare_work+0xe4/0x1c0 [kvm_hv]
      [  635.280507] [c0000007e9183c40] [c000000000113c0c] process_one_work+0x1dc/0x680
      [  635.280587] [c0000007e9183ce0] [c000000000114250] worker_thread+0x1a0/0x520
      [  635.280655] [c0000007e9183d80] [c00000000012010c] kthread+0xec/0x100
      [  635.280724] [c0000007e9183e30] [c00000000000a4b8] ret_from_kernel_thread+0x5c/0xa4
      [  635.280814] Instruction dump:
      [  635.280880] 7c0802a6 fba1ffe8 fbc1fff0 7cbd2b78 fbe1fff8 7c9e2378 7c7f1b78
      f8010010
      [  635.281099] f821ff81 e8a50008 7fa52040 40de00b8 <e8be0000> 7fbd2840 40de008c
      7fbff040
      [  635.281324] ---[ end trace b628b73449719b9d ]---
      
      Cc: stable@vger.kernel.org # v4.10+
      Fixes: b5baa687 ("KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation")
      Signed-off-by: Serhii Popovych <spopovyc@redhat.com>
      [dwg: Replaced BUG_ON()s with WARN_ONs() and reworded commit message
       for clarity]
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Drop prepare_done from struct kvm_resize_hpt · 3073774e
      Committed by Serhii Popovych
      Currently the kvm_resize_hpt structure has two fields relevant to the
      state of an ongoing resize: 'prepare_done', which indicates whether
      the worker thread has completed or not, and 'error' which indicates
      whether it was successful or not.
      
      Since the success/failure isn't known until completion, this is
      confusingly redundant.  This patch consolidates the information into
      just the 'error' value: -EBUSY indicates the work is still in
      progress, other negative values indicate (completed) failure, and 0
      indicates successful completion.
      
      As a bonus this reduces the size of struct kvm_resize_hpt by
      __alignof__(struct kvm_hpt_info) and saves a few bytes of code.
      
      While there, correct the comment in struct kvm_resize_hpt which
      references a non-existent semaphore (leftover from an early draft).
      
      Assert with WARN_ON() in case the HPT allocation work runs more than
      once for a resize request, or resize_hpt_allocate() returns -EBUSY,
      which is treated specially.
      
      Change comparison against zero to make checkpatch.pl happy.
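
      Under the consolidated scheme, a caller can distinguish all three states
      from the single field (hedged sketch; the 'state' values are
      hypothetical, only resize->error is real):

              /* kvm->lock must be held to read resize->error */
              if (resize->error == -EBUSY)
                      state = IN_PROGRESS;    /* worker not yet finished */
              else if (resize->error < 0)
                      state = FAILED;         /* completed, failed */
              else
                      state = READY;          /* completed successfully */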
      
      Cc: stable@vger.kernel.org # v4.10+
      Signed-off-by: Serhii Popovych <spopovyc@redhat.com>
      [dwg: Changed BUG_ON()s to WARN_ON()s and altered commit message for
       clarity]
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  8. 23 Nov 2017, 1 commit
    • KVM: PPC: Book3S HV: Fix migration and HPT resizing of HPT guests on radix hosts · ded13fc1
      Committed by Paul Mackerras
      This fixes two errors that prevent a guest using the HPT MMU from
      successfully migrating to a POWER9 host in radix MMU mode, or resizing
      its HPT when running on a radix host.
      
      The first bug was that commit 8dc6cca5 ("KVM: PPC: Book3S HV:
      Don't rely on host's page size information", 2017-09-11) missed two
      uses of hpte_base_page_size(), one in the HPT rehashing code and
      one in kvm_htab_write() (which is used on the destination side in
      migrating a HPT guest).  Instead we use kvmppc_hpte_base_page_shift().
      Having the shift count means that we can use left and right shifts
      instead of multiplication and division in a few places.
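
      For example (illustrative), a size computation that previously needed
      hpte_base_page_size() can become a shift:

              psize = 1ul << kvmppc_hpte_base_page_shift(v, r);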
      
      Along the way, this adds a check in kvm_htab_write() to ensure that the
      page size encoding in the incoming HPTEs is recognized, returning an
      EINVAL error to userspace if it is not.
      
      The second bug was that kvm_htab_write was performing some but not all
      of the functions of kvmhv_setup_mmu(), resulting in the destination VM
      being left in radix mode as far as the hardware is concerned.  The
      simplest fix for now is to make kvm_htab_write() call
      kvmppc_setup_partition_table() like kvmppc_hv_setup_htab_rma() does.
      In future it would be better to refactor the code more extensively
      to remove the duplication.
      
      Fixes: 8dc6cca5 ("KVM: PPC: Book3S HV: Don't rely on host's page size information")
      Fixes: 7a84084c ("KVM: PPC: Book3S HV: Set partition table rather than SDR1 on POWER9")
      Reported-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
      Tested-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  9. 08 Nov 2017, 1 commit
    • KVM: PPC: Book3S HV: Fix exclusion between HPT resizing and other HPT updates · 38c53af8
      Committed by Paul Mackerras
      Commit 5e985969 ("KVM: PPC: Book3S HV: Outline of KVM-HV HPT resizing
      implementation", 2016-12-20) added code that tries to exclude any use
      or update of the hashed page table (HPT) while the HPT resizing code
      is iterating through all the entries in the HPT.  It does this by
      taking the kvm->lock mutex, clearing the kvm->arch.hpte_setup_done
      flag and then sending an IPI to all CPUs in the host.  The idea is
      that any VCPU task that tries to enter the guest will see that the
      hpte_setup_done flag is clear and therefore call kvmppc_hv_setup_htab_rma,
      which also takes the kvm->lock mutex and will therefore block until
      we release kvm->lock.
      
      However, any VCPU that is already in the guest, or is handling a
      hypervisor page fault or hypercall, can re-enter the guest without
      rechecking the hpte_setup_done flag.  The IPI will cause a guest exit
      of any VCPUs that are currently in the guest, but does not prevent
      those VCPU tasks from immediately re-entering the guest.
      
      The result is that after resize_hpt_rehash_hpte() has made a HPTE
      absent, a hypervisor page fault can occur and make that HPTE present
      again.  This includes updating the rmap array for the guest real page,
      meaning that we now have a pointer in the rmap array which connects
      with pointers in the old rev array but not the new rev array.  In
      fact, if the HPT is being reduced in size, the pointer in the rmap
      array could point outside the bounds of the new rev array.  If that
      happens, we can get a host crash later on such as this one:
      
      [91652.628516] Unable to handle kernel paging request for data at address 0xd0000000157fb10c
      [91652.628668] Faulting instruction address: 0xc0000000000e2640
      [91652.628736] Oops: Kernel access of bad area, sig: 11 [#1]
      [91652.628789] LE SMP NR_CPUS=1024 NUMA PowerNV
      [91652.628847] Modules linked in: binfmt_misc vhost_net vhost tap xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack libcrc32c iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables ses enclosure scsi_transport_sas i2c_opal ipmi_powernv ipmi_devintf i2c_core ipmi_msghandler powernv_op_panel nfsd auth_rpcgss oid_registry nfs_acl lockd grace sunrpc kvm_hv kvm_pr kvm scsi_dh_alua dm_service_time dm_multipath tg3 ptp pps_core [last unloaded: stap_552b612747aec2da355051e464fa72a1_14259]
      [91652.629566] CPU: 136 PID: 41315 Comm: CPU 21/KVM Tainted: G           O    4.14.0-1.rc4.dev.gitb27fc5c.el7.centos.ppc64le #1
      [91652.629684] task: c0000007a419e400 task.stack: c0000000028d8000
      [91652.629750] NIP:  c0000000000e2640 LR: d00000000c36e498 CTR: c0000000000e25f0
      [91652.629829] REGS: c0000000028db5d0 TRAP: 0300   Tainted: G           O     (4.14.0-1.rc4.dev.gitb27fc5c.el7.centos.ppc64le)
      [91652.629932] MSR:  900000010280b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE,TM[E]>  CR: 44022422  XER: 00000000
      [91652.630034] CFAR: d00000000c373f84 DAR: d0000000157fb10c DSISR: 40000000 SOFTE: 1
      [91652.630034] GPR00: d00000000c36e498 c0000000028db850 c000000001403900 c0000007b7960000
      [91652.630034] GPR04: d0000000117fb100 d000000007ab00d8 000000000033bb10 0000000000000000
      [91652.630034] GPR08: fffffffffffffe7f 801001810073bb10 d00000000e440000 d00000000c373f70
      [91652.630034] GPR12: c0000000000e25f0 c00000000fdb9400 f000000003b24680 0000000000000000
      [91652.630034] GPR16: 00000000000004fb 00007ff7081a0000 00000000000ec91a 000000000033bb10
      [91652.630034] GPR20: 0000000000010000 00000000001b1190 0000000000000001 0000000000010000
      [91652.630034] GPR24: c0000007b7ab8038 d0000000117fb100 0000000ec91a1190 c000001e6a000000
      [91652.630034] GPR28: 00000000033bb100 000000000073bb10 c0000007b7960000 d0000000157fb100
      [91652.630735] NIP [c0000000000e2640] kvmppc_add_revmap_chain+0x50/0x120
      [91652.630806] LR [d00000000c36e498] kvmppc_book3s_hv_page_fault+0xbb8/0xc40 [kvm_hv]
      [91652.630884] Call Trace:
      [91652.630913] [c0000000028db850] [c0000000028db8b0] 0xc0000000028db8b0 (unreliable)
      [91652.630996] [c0000000028db8b0] [d00000000c36e498] kvmppc_book3s_hv_page_fault+0xbb8/0xc40 [kvm_hv]
      [91652.631091] [c0000000028db9e0] [d00000000c36a078] kvmppc_vcpu_run_hv+0xdf8/0x1300 [kvm_hv]
      [91652.631179] [c0000000028dbb30] [d00000000c2248c4] kvmppc_vcpu_run+0x34/0x50 [kvm]
      [91652.631266] [c0000000028dbb50] [d00000000c220d54] kvm_arch_vcpu_ioctl_run+0x114/0x2a0 [kvm]
      [91652.631351] [c0000000028dbbd0] [d00000000c2139d8] kvm_vcpu_ioctl+0x598/0x7a0 [kvm]
      [91652.631433] [c0000000028dbd40] [c0000000003832e0] do_vfs_ioctl+0xd0/0x8c0
      [91652.631501] [c0000000028dbde0] [c000000000383ba4] SyS_ioctl+0xd4/0x130
      [91652.631569] [c0000000028dbe30] [c00000000000b8e0] system_call+0x58/0x6c
      [91652.631635] Instruction dump:
      [91652.631676] fba1ffe8 fbc1fff0 fbe1fff8 f8010010 f821ffa1 2fa70000 793d0020 e9432110
      [91652.631814] 7bbf26e4 7c7e1b78 7feafa14 409e0094 <807f000c> 786326e4 7c6a1a14 93a40008
      [91652.631959] ---[ end trace ac85ba6db72e5b2e ]---
      
      To fix this, we tighten up the way that the hpte_setup_done flag is
      checked to ensure that it does provide the guarantee that the resizing
      code needs.  In kvmppc_run_core(), we check the hpte_setup_done flag
      after disabling interrupts and refuse to enter the guest if it is
      clear (for a HPT guest).  The code that checks hpte_setup_done and
      calls kvmppc_hv_setup_htab_rma() is moved from kvmppc_vcpu_run_hv()
      to a point inside the main loop in kvmppc_run_vcpu(), ensuring that
      we don't just spin endlessly calling kvmppc_run_core() while
      hpte_setup_done is clear, but instead have a chance to block on the
      kvm->lock mutex.
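
      Conceptually, the guest-entry check now has this shape (a simplified
      sketch, not the literal code; vc is the virtual core):

              local_irq_disable();
              if (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done) {
                      local_irq_enable();
                      return; /* refuse to enter the guest; retry via run loop */
              }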
      
      Finally we also check hpte_setup_done inside the region in
      kvmppc_book3s_hv_page_fault() where the HPTE is locked and we are about
      to update the HPTE, and bail out if it is clear.  If another CPU is
      inside kvm_vm_ioctl_resize_hpt_commit() and has cleared hpte_setup_done,
      then we know that either we are looking at a HPTE
      that resize_hpt_rehash_hpte() has not yet processed, which is OK,
      or else we will see hpte_setup_done clear and refuse to update it,
      because of the full barrier formed by the unlock of the HPTE in
      resize_hpt_rehash_hpte() combined with the locking of the HPTE
      in kvmppc_book3s_hv_page_fault().
      
      Fixes: 5e985969 ("KVM: PPC: Book3S HV: Outline of KVM-HV HPT resizing implementation")
      Cc: stable@vger.kernel.org # v4.10+
      Reported-by: Satheesh Rajendran <satheera@in.ibm.com>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  10. 01 Nov 2017, 4 commits
    • KVM: PPC: Book3S HV: Add infrastructure for running HPT guests on radix host · 18c3640c
      Committed by Paul Mackerras
      This sets up the machinery for switching a guest between HPT (hashed
      page table) and radix MMU modes, so that in future we can run a HPT
      guest on a radix host on POWER9 machines.
      
      * The KVM_PPC_CONFIGURE_V3_MMU ioctl can now specify either HPT or
        radix mode, on a radix host.
      
      * The KVM_CAP_PPC_MMU_HASH_V3 capability now returns 1 on POWER9
        with HV KVM on a radix host.
      
      * The KVM_PPC_GET_SMMU_INFO ioctl returns information about the HPT MMU
        on a radix host.
      
      * The KVM_PPC_ALLOCATE_HTAB ioctl on a radix host will switch the
        guest to HPT mode and allocate a HPT.
      
      * For simplicity, we now allocate the rmap array for each memslot,
        even on a radix host, since it will be needed if the guest switches
        to HPT mode.
      
      * Since we cannot yet run a HPT guest on a radix host, the KVM_RUN
        ioctl will return an EINVAL error in that case.
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Unify dirty page map between HPT and radix · e641a317
      Committed by Paul Mackerras
      Currently, the HPT code in HV KVM maintains a dirty bit per guest page
      in the rmap array, whether or not dirty page tracking has been enabled
      for the memory slot.  In contrast, the radix code maintains a dirty
      bit per guest page in memslot->dirty_bitmap, and only does so when
      dirty page tracking has been enabled.
      
      This changes the HPT code to maintain the dirty bits in the memslot
      dirty_bitmap like radix does.  This results in slightly less code
      overall, and will mean that we do not lose the dirty bits when
      transitioning between HPT and radix mode in future.
      
      There is one minor change to behaviour as a result.  With HPT, when
      dirty tracking was enabled for a memslot, we would previously clear
      all the dirty bits at that point (both in the HPT entries and in the
      rmap arrays), meaning that a KVM_GET_DIRTY_LOG ioctl immediately
      following would show no pages as dirty (assuming no vcpus have run
      in the meantime).  With this change, the dirty bits on HPT entries
      are not cleared at the point where dirty tracking is enabled, so
      KVM_GET_DIRTY_LOG would show as dirty any guest pages that are
      resident in the HPT and dirty.  This is consistent with what happens
      on radix.
      
      This also fixes a bug in the mark_pages_dirty() function for radix
      (in the sense that the function no longer exists).  In the case where
      a large page of 64 normal pages or more is marked dirty, the
      addressing of the dirty bitmap was incorrect and could write past
      the end of the bitmap.  Fortunately this case was never hit in
      practice because a 2MB large page is only 32 x 64kB pages, and we
      don't support backing the guest with 1GB huge pages at this point.
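
      A sketch of a dirty-bitmap helper in the spirit of this change, handling
      runs longer than one 64-bit word correctly (illustrative; assumes i is
      word-aligned when npages >= 8, as it is for huge pages):

              static void set_dirty_bits(unsigned long *map, unsigned long i,
                                         unsigned long npages)
              {
                      if (npages >= 8)
                              memset((char *)map + i / 8, 0xff, npages / 8);
                      else
                              for (; npages; ++i, --npages)
                                      __set_bit_le(i, map);
              }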
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Rename hpte_setup_done to mmu_ready · 1b151ce4
      Committed by Paul Mackerras
      This renames the kvm->arch.hpte_setup_done field to mmu_ready because
      we will want to use it for radix guests too -- both for setting things
      up before vcpu execution, and for excluding vcpus from executing while
      MMU-related things get changed, such as in future switching the MMU
      from radix to HPT mode or vice-versa.
      
      This also moves the call to kvmppc_setup_partition_table() that was
      done in kvmppc_hv_setup_htab_rma() for HPT guests, and the setting
      of mmu_ready, into the caller in kvmppc_vcpu_run_hv().
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Don't rely on host's page size information · 8dc6cca5
      Committed by Paul Mackerras
      This removes the dependence of KVM on the mmu_psize_defs array (which
      stores information about hardware support for various page sizes) and
      the things derived from it, chiefly hpte_page_sizes[], hpte_page_size(),
      hpte_actual_page_size() and get_sllp_encoding().  We also no longer
      rely on the mmu_slb_size variable or the MMU_FTR_1T_SEGMENTS feature
      bit.
      
      The reason for doing this is so we can support a HPT guest on a radix
      host.  In a radix host, the mmu_psize_defs array contains information
      about page sizes supported by the MMU in radix mode rather than the
      page sizes supported by the MMU in HPT mode.  Similarly, mmu_slb_size
      and the MMU_FTR_1T_SEGMENTS bit are not set.
      
      Instead we hard-code knowledge of the behaviour of the HPT MMU in the
      POWER7, POWER8 and POWER9 processors (which are the only processors
      supported by HV KVM) - specifically the encoding of the LP fields in
      the HPT and SLB entries, and the fact that they have 32 SLB entries
      and support 1TB segments.
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  11. 16 Oct 2017, 1 commit
    • KVM: PPC: Book3S HV: Explicitly disable HPT operations on radix guests · 891f1ebf
      Committed by Paul Mackerras
      This adds code to make sure that we don't try to access the
      non-existent HPT for a radix guest using the htab file for the VM
      in debugfs, a file descriptor obtained using the KVM_PPC_GET_HTAB_FD
      ioctl, or via the KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} ioctls.
      
      At present nothing bad happens if userspace does access these
      interfaces on a radix guest, mostly because kvmppc_hpt_npte()
      gives 0 for a radix guest, which in turn is because 1 << -4
      comes out as 0 on POWER processors.  However, that relies on
      undefined behaviour, so it is better to be explicit about not
      accessing the HPT for a radix guest.
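
      The guards are conceptually one-liners at the top of each affected path
      (illustrative):

              if (kvm_is_radix(kvm))
                      return -EINVAL; /* radix guests have no HPT */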
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  12. 14 Oct 2017, 1 commit
  13. 01 Sep 2017, 1 commit
  14. 17 Aug 2017, 1 commit
    • powerpc/mm: Rename find_linux_pte_or_hugepte() · 94171b19
      Committed by Aneesh Kumar K.V
      Add newer helpers to make the function usage simpler. It is always
      recommended to use find_current_mm_pte() for walking the page table.
      If we cannot use find_current_mm_pte(), it should be documented why
      the said usage of __find_linux_pte() is safe against a parallel THP
      split.
      
      For now we have KVM code using __find_linux_pte(). This is because kvm
      code ends up calling __find_linux_pte() in real mode with MSR_EE=0 but
      with PACA soft_enabled = 1. We may want to fix that later and make
      sure we keep the MSR_EE and PACA soft_enabled in sync. When we do that
      we can switch kvm to use find_linux_pte().
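
      Typical usage of the recommended helper (a hedged sketch; the caller
      must keep interrupts off so a parallel THP split cannot free the page
      tables underneath it):

              unsigned long flags;
              unsigned int shift;
              pte_t *ptep, pte;

              local_irq_save(flags);
              ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, &shift);
              if (ptep)
                      pte = READ_ONCE(*ptep);
              local_irq_restore(flags);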
      Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
  15. 24 Jul 2017, 1 commit
    • KVM: PPC: Book3S HV: Fix host crash on changing HPT size · ef427198
      Committed by Paul Mackerras
      Commit f98a8bf9 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB
      ioctl() to change HPT size", 2016-12-20) changed the behaviour of
      the KVM_PPC_ALLOCATE_HTAB ioctl so that it now allocates a new HPT
      and new revmap array if there was a previously-allocated HPT of a
      different size from the size being requested.  In this case, we need
      to reset the rmap arrays of the memslots, because the rmap arrays
      will contain references to HPTEs which are no longer valid.  Worse,
      these references are also references to slots in the new revmap
      array (which parallels the HPT), and the new revmap array contains
      random contents, since it doesn't get zeroed on allocation.
      
      The effect of having these stale references to slots in the revmap
      array that contain random contents is that subsequent calls to
      functions such as kvmppc_add_revmap_chain will crash because they
      will interpret the non-zero contents of the revmap array as HPTE
      indexes and thus index outside of the revmap array.  This leads to
      host crashes such as the following.
      
      [ 7072.862122] Unable to handle kernel paging request for data at address 0xd000000c250c00f8
      [ 7072.862218] Faulting instruction address: 0xc0000000000e1c78
      [ 7072.862233] Oops: Kernel access of bad area, sig: 11 [#1]
      [ 7072.862286] SMP NR_CPUS=1024
      [ 7072.862286] NUMA
      [ 7072.862325] PowerNV
      [ 7072.862378] Modules linked in: kvm_hv vhost_net vhost tap xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm iw_cxgb3 mlx5_ib ib_core ses enclosure scsi_transport_sas ipmi_powernv ipmi_devintf ipmi_msghandler powernv_op_panel i2c_opal nfsd auth_rpcgss oid_registry
      [ 7072.863085]  nfs_acl lockd grace sunrpc kvm_pr kvm xfs libcrc32c scsi_dh_alua dm_service_time radeon lpfc nvme_fc nvme_fabrics nvme_core scsi_transport_fc i2c_algo_bit tg3 drm_kms_helper ptp pps_core syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm dm_multipath i2c_core cxgb3 mlx5_core mdio [last unloaded: kvm_hv]
      [ 7072.863381] CPU: 72 PID: 56929 Comm: qemu-system-ppc Not tainted 4.12.0-kvm+ #59
      [ 7072.863457] task: c000000fe29e7600 task.stack: c000001e3ffec000
      [ 7072.863520] NIP: c0000000000e1c78 LR: c0000000000e2e3c CTR: c0000000000e25f0
      [ 7072.863596] REGS: c000001e3ffef560 TRAP: 0300   Not tainted  (4.12.0-kvm+)
      [ 7072.863658] MSR: 9000000100009033 <SF,HV,EE,ME,IR,DR,RI,LE,TM[E]>
      [ 7072.863667]   CR: 44082882  XER: 20000000
      [ 7072.863767] CFAR: c0000000000e2e38 DAR: d000000c250c00f8 DSISR: 42000000 SOFTE: 1
      GPR00: c0000000000e2e3c c000001e3ffef7e0 c000000001407d00 d000000c250c00f0
      GPR04: d00000006509fb70 d00000000b3d2048 0000000003ffdfb7 0000000000000000
      GPR08: 00000001007fdfb7 00000000c000000f d0000000250c0000 000000000070f7bf
      GPR12: 0000000000000008 c00000000fdad000 0000000010879478 00000000105a0d78
      GPR16: 00007ffaf4080000 0000000000001190 0000000000000000 0000000000010000
      GPR20: 4001ffffff000415 d00000006509fb70 0000000004091190 0000000ee1881190
      GPR24: 0000000003ffdfb7 0000000003ffdfb7 00000000007fdfb7 c000000f5c958000
      GPR28: d00000002d09fb70 0000000003ffdfb7 d00000006509fb70 d00000000b3d2048
      [ 7072.864439] NIP [c0000000000e1c78] kvmppc_add_revmap_chain+0x88/0x130
      [ 7072.864503] LR [c0000000000e2e3c] kvmppc_do_h_enter+0x84c/0x9e0
      [ 7072.864566] Call Trace:
      [ 7072.864594] [c000001e3ffef7e0] [c000001e3ffef830] 0xc000001e3ffef830 (unreliable)
      [ 7072.864671] [c000001e3ffef830] [c0000000000e2e3c] kvmppc_do_h_enter+0x84c/0x9e0
      [ 7072.864751] [c000001e3ffef920] [d00000000b38d878] kvmppc_map_vrma+0x168/0x200 [kvm_hv]
      [ 7072.864831] [c000001e3ffef9e0] [d00000000b38a684] kvmppc_vcpu_run_hv+0x1284/0x1300 [kvm_hv]
      [ 7072.864914] [c000001e3ffefb30] [d00000000f465664] kvmppc_vcpu_run+0x44/0x60 [kvm]
      [ 7072.865008] [c000001e3ffefb60] [d00000000f461864] kvm_arch_vcpu_ioctl_run+0x114/0x290 [kvm]
      [ 7072.865152] [c000001e3ffefbe0] [d00000000f453c98] kvm_vcpu_ioctl+0x598/0x7a0 [kvm]
      [ 7072.865292] [c000001e3ffefd40] [c000000000389328] do_vfs_ioctl+0xd8/0x8c0
      [ 7072.865410] [c000001e3ffefde0] [c000000000389be4] SyS_ioctl+0xd4/0x130
      [ 7072.865526] [c000001e3ffefe30] [c00000000000b760] system_call+0x58/0x6c
      [ 7072.865644] Instruction dump:
      [ 7072.865715] e95b2110 793a0020 7b4926e4 7f8a4a14 409e0098 807c000c 786326e4 7c6a1a14
      [ 7072.865857] 935e0008 7bbd0020 813c000c 913e000c <93a30008> 93bc000c 48000038 60000000
      [ 7072.866001] ---[ end trace 627b6e4bf8080edc ]---
      
      Note that to trigger this, it is necessary to use a recent upstream
      QEMU (or other userspace that resizes the HPT at CAS time), specify
      a maximum memory size substantially larger than the current memory
      size, and boot a guest kernel that does not support HPT resizing.
      
      This fixes the problem by resetting the rmap arrays when the old HPT
      is freed.
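
      Schematically, the reset walks every memslot and clears its rmap array
      (an illustrative sketch of what the reset amounts to):

              kvm_for_each_memslot(memslot, slots)
                      memset(memslot->arch.rmap, 0,
                             memslot->npages * sizeof(*memslot->arch.rmap));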
      
      Fixes: f98a8bf9 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size")
      Cc: stable@vger.kernel.org # v4.11+
      Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  16. 13 Jul 2017, 1 commit
    • mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic · dcda9b04
      Committed by Michal Hocko
      __GFP_REPEAT was designed to allow retry-but-eventually-fail semantic to
      the page allocator.  This has been true, but only for allocation
      requests larger than PAGE_ALLOC_COSTLY_ORDER.  It has always been
      ignored for smaller sizes.
      no way to express the same semantic for those requests and they are
      considered too important to fail so they might end up looping in the
      page allocator for ever, similarly to GFP_NOFAIL requests.
      
      Now that the whole tree has been cleaned up and accidental or misled
      usage of __GFP_REPEAT flag has been removed for !costly requests we can
      give the original flag a better name and more importantly a more useful
      semantic.  Let's rename it to __GFP_RETRY_MAYFAIL which tells the user
      that the allocator would try really hard but there is no promise of a
      success.  This will work independent of the order and overrides the
      default allocator behavior.  Page allocator users have several levels of
      guarantee vs.  cost options (take GFP_KERNEL as an example)
      
       - GFP_KERNEL & ~__GFP_RECLAIM - optimistic allocation without _any_
         attempt to free memory at all. The most light weight mode which even
         doesn't kick the background reclaim. Should be used carefully because
         it might deplete the memory and the next user might hit the more
         aggressive reclaim
      
       - GFP_KERNEL & ~__GFP_DIRECT_RECLAIM (or GFP_NOWAIT) - optimistic
         allocation without any attempt to free memory from the current
         context but can wake kswapd to reclaim memory if the zone is below
         the low watermark. Can be used from either atomic contexts or when
         the request is a performance optimization and there is another
         fallback for a slow path.
      
       - (GFP_KERNEL|__GFP_HIGH) & ~__GFP_DIRECT_RECLAIM (aka GFP_ATOMIC) -
         non sleeping allocation with an expensive fallback so it can access
         some portion of memory reserves. Usually used from interrupt/bh
         context with an expensive slow path fallback.
      
       - GFP_KERNEL - both background and direct reclaim are allowed and the
         _default_ page allocator behavior is used. That means that !costly
         allocation requests are basically nofail but there is no guarantee of
         that behavior so failures have to be checked properly by callers
         (e.g. OOM killer victim is allowed to fail currently).
      
       - GFP_KERNEL | __GFP_NORETRY - overrides the default allocator behavior
         and all allocation requests fail early rather than cause disruptive
         reclaim (one round of reclaim in this implementation). The OOM killer
         is not invoked.
      
       - GFP_KERNEL | __GFP_RETRY_MAYFAIL - overrides the default allocator
         behavior and all allocation requests try really hard. The request
         will fail if the reclaim cannot make any progress. The OOM killer
         won't be triggered.
      
       - GFP_KERNEL | __GFP_NOFAIL - overrides the default allocator behavior
         and all allocation requests will loop endlessly until they succeed.
         This might be really dangerous especially for larger orders.
      
      Existing users of __GFP_REPEAT are changed to __GFP_RETRY_MAYFAIL
      because they already had their semantic.  No new users are added.
      __alloc_pages_slowpath is changed to bail out for __GFP_RETRY_MAYFAIL if
      there is no progress and we have already passed the OOM point.
      
      This means that all the reclaim opportunities have been exhausted except
      the most disruptive one (the OOM killer) and a user defined fallback
      behavior is more sensible than to keep retrying in the page allocator.
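
      A common pattern enabled by the new flag (illustrative): try hard for a
      physically contiguous allocation without invoking the OOM killer, then
      fall back to vmalloc():

              ptr = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
              if (!ptr)
                      ptr = vmalloc(size);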
      
      [akpm@linux-foundation.org: fix arch/sparc/kernel/mdesc.c]
      [mhocko@suse.com: semantic fix]
        Link: http://lkml.kernel.org/r/20170626123847.GM11534@dhcp22.suse.cz
      [mhocko@kernel.org: address other thing spotted by Vlastimil]
        Link: http://lkml.kernel.org/r/20170626124233.GN11534@dhcp22.suse.cz
      Link: http://lkml.kernel.org/r/20170623085345.11304-3-mhocko@kernel.org
      Signed-off-by: Michal Hocko <mhocko@suse.com>
      Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Cc: Alex Belits <alex.belits@cavium.com>
      Cc: Chris Wilson <chris@chris-wilson.co.uk>
      Cc: Christoph Hellwig <hch@infradead.org>
      Cc: Darrick J. Wong <darrick.wong@oracle.com>
      Cc: David Daney <david.daney@cavium.com>
      Cc: Johannes Weiner <hannes@cmpxchg.org>
      Cc: Mel Gorman <mgorman@suse.de>
      Cc: NeilBrown <neilb@suse.com>
      Cc: Ralf Baechle <ralf@linux-mips.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  17. 06 Apr 2017, 1 commit
  18. 10 Mar 2017, 1 commit
  19. 18 Feb 2017, 1 commit
  20. 17 Feb 2017, 1 commit
  21. 16 Feb 2017, 1 commit
    • KVM: PPC: Book3S HV: Prevent double-free on HPT resize commit path · 5b73d634
      Committed by David Gibson
      resize_hpt_release(), called once the HPT resize of a KVM guest is
      completed (successfully or unsuccessfully), frees the state structure for
      the resize.  It is currently not safe to call it with a NULL pointer.
      
      However, one of the error paths in kvm_vm_ioctl_resize_hpt_commit() can
      invoke it with a NULL pointer.  This will occur if userspace improperly
      invokes KVM_PPC_RESIZE_HPT_COMMIT without previously calling
      KVM_PPC_RESIZE_HPT_PREPARE, or if it calls COMMIT twice without an
      intervening PREPARE.
      
      To fix this potential crash bug - and maybe others like it - make it safe
      (and a no-op) to call resize_hpt_release() with a NULL resize pointer.
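
      The fix is essentially an early return (illustrative):

              static void resize_hpt_release(struct kvm *kvm,
                                             struct kvm_resize_hpt *resize)
              {
                      if (!resize)
                              return;         /* now safe: NULL is a no-op */
                      /* ... free the resize state as before ... */
              }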
      
      Found by Dan Carpenter with a static checker.
      Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
  22. 31 Jan 2017, 13 commits
    • KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation · b5baa687
      Committed by David Gibson
      This adds the "guts" of the implementation for the HPT resizing PAPR
      extension.  It has the code to allocate and clear a new HPT, rehash an
      existing HPT's entries into it, and accomplish the switchover for a
      KVM guest from the old HPT to the new one.
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Outline of KVM-HV HPT resizing implementation · 5e985969
      Committed by David Gibson
      This adds a not yet working outline of the HPT resizing PAPR
      extension.  Specifically it adds the necessary ioctl() functions,
      their basic steps, the work function which will handle preparation for
      the resize, and synchronization between these, the guest page fault
      path and guest HPT update path.
      
      The actual guts of the implementation isn't here yet, so for now the
      calls will always fail.
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Create kvmppc_unmap_hpte_helper() · 639e4597
      Committed by David Gibson
      The kvm_unmap_rmapp() function, called from certain MMU notifiers, is used
      to force all guest mappings of a particular host page to be set ABSENT, and
      removed from the reverse mappings.
      
      For HPT resizing, we will have some cases where we want to set just a
      single guest HPTE ABSENT and remove its reverse mappings.  To prepare for
      this, we split out the logic from kvm_unmap_rmapp() to evict a single HPTE,
      moving it to a new helper function.
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size · f98a8bf9
      Committed by David Gibson
      The KVM_PPC_ALLOCATE_HTAB ioctl() is used to set the size of hashed page
      table (HPT) that userspace expects a guest VM to have, and is also used to
      clear that HPT when necessary (e.g. guest reboot).
      
      At present, once the ioctl() is called for the first time, the HPT size can
      never be changed thereafter - it will be cleared but always sized as from
      the first call.
      
      With upcoming HPT resize implementation, we're going to need to allow
      userspace to resize the HPT at reset (to change it back to the default size
      if the guest changed it).
      
      So, we need to allow this ioctl() to change the HPT size.
      
      This patch also updates Documentation/virtual/kvm/api.txt to reflect
      the new behaviour.  In fact the documentation was already slightly
      incorrect since commit 572abd56 ("KVM: PPC: Book3S HV: Don't fall back to
      smaller HPT size in allocation ioctl").
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Split HPT allocation from activation · aae0777f
      Committed by David Gibson
      Currently, kvmppc_alloc_hpt() both allocates a new hashed page table (HPT)
      and sets it up as the active page table for a VM.  For the upcoming HPT
      resize implementation we're going to want to allocate HPTs separately from
      activating them.
      
      So, split the allocation itself out into kvmppc_allocate_hpt() and perform
      the activation with a new kvmppc_set_hpt() function.  Likewise we split
      kvmppc_free_hpt(), which just frees the HPT, from kvmppc_release_hpt()
      which unsets it as an active HPT, then frees it.
      
      We also move the logic to fall back to smaller HPT sizes if the first try
      fails into the single caller which used that behaviour,
      kvmppc_hv_setup_htab_rma().  This introduces a slight semantic change, in
      that previously if the initial attempt at CMA allocation failed, we would
      fall back to attempting smaller sizes with the page allocator.  Now, we
      try first CMA, then the page allocator at each size.  As far as I can tell
      this change should be harmless.
      
      To match, we make kvmppc_free_hpt() just free the actual HPT itself.  The
      call to kvmppc_free_lpid() that was there, we move to the single caller.
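
      After the split, the allocate-then-activate flow looks roughly like this
      (hedged sketch):

              struct kvm_hpt_info info;

              err = kvmppc_allocate_hpt(&info, order);
              if (!err)
                      kvmppc_set_hpt(kvm, &info);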
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Don't store values derivable from HPT order · 3d089f84
      Committed by David Gibson
      Currently the kvm_hpt_info structure stores the hashed page table's order,
      and also the number of HPTEs it contains and a mask for its size.  The
      last two can be easily derived from the order, so remove them and just
      calculate them as necessary with a couple of helper inlines.
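
      The helpers are small enough to sketch here (each HPTE is 16 bytes and
      HPTEs come in groups of 8, hence the constants; illustrative):

              static inline unsigned long kvmppc_hpt_npte(struct kvm_hpt_info *hpt)
              {
                      return 1UL << (hpt->order - 4);
              }

              static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
              {
                      return (1UL << (hpt->order - 7)) - 1;
              }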
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Reviewed-by: Thomas Huth <thuth@redhat.com>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Gather HPT related variables into sub-structure · 3f9d4f5a
      Committed by David Gibson
      Currently, the powerpc kvm_arch structure contains a number of variables
      tracking the state of the guest's hashed page table (HPT) in KVM HV.  This
      patch gathers them all together into a single kvm_hpt_info substructure.
      This makes life more convenient for the upcoming HPT resizing
      implementation.
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Rename kvm_alloc_hpt() for clarity · db9a290d
      Committed by David Gibson
      The difference between kvm_alloc_hpt() and kvmppc_alloc_hpt() is not at
      all obvious from the name.  In practice kvmppc_alloc_hpt() allocates an HPT
      by whatever means, and calls kvm_alloc_hpt() which will attempt to allocate
      it with CMA only.
      
      To make this less confusing, rename kvm_alloc_hpt() to kvm_alloc_hpt_cma().
      Similarly, kvm_release_hpt() is renamed kvm_free_hpt_cma().
      Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
      Reviewed-by: Thomas Huth <thuth@redhat.com>
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
    • KVM: PPC: Book3S HV: Enable radix guest support · 8cf4ecc0
      Committed by Paul Mackerras
      This adds a few last pieces of the support for radix guests:
      
      * Implement the backends for the KVM_PPC_CONFIGURE_V3_MMU and
        KVM_PPC_GET_RMMU_INFO ioctls for radix guests
      
      * On POWER9, allow secondary threads to be on/off-lined while guests
        are running.
      
      * Set up LPCR and the partition table entry for radix guests.
      
      * Don't allocate the rmap array in the kvm_memory_slot structure
        on radix.
      
      * Don't try to initialize the HPT for radix guests, since they don't
        have an HPT.
      
      * Take out the code that prevents the HV KVM module from
        initializing on radix hosts.
      
      At this stage, we only support radix guests if the host is running
      in radix mode, and only support HPT guests if the host is running in
      HPT mode.  Thus a guest cannot switch from one mode to the other,
      which enables some simplifications.
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
    • KVM: PPC: Book3S HV: Implement dirty page logging for radix guests · 8f7b79b8
      Committed by Paul Mackerras
      This adds code to keep track of dirty pages when requested (that is,
      when memslot->dirty_bitmap is non-NULL) for radix guests.  We use the
      dirty bits in the PTEs in the second-level (partition-scoped) page
      tables, together with a bitmap of pages that were dirty when their
      PTE was invalidated (e.g., when the page was paged out).  This bitmap
      is stored in the first half of the memslot->dirty_bitmap area, and
      kvm_vm_ioctl_get_dirty_log_hv() now uses the second half for the
      bitmap that gets returned to userspace.
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
    • KVM: PPC: Book3S HV: MMU notifier callbacks for radix guests · 01756099
      Committed by Paul Mackerras
      This adapts our implementations of the MMU notifier callbacks
      (unmap_hva, unmap_hva_range, age_hva, test_age_hva, set_spte_hva)
      to call radix functions when the guest is using radix.  These
      implementations are much simpler than for HPT guests because we
      have only one PTE to deal with, so we don't need to traverse
      rmap chains.
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
    • KVM: PPC: Book3S HV: Page table construction and page faults for radix guests · 5a319350
      Committed by Paul Mackerras
      This adds the code to construct the second-level ("partition-scoped" in
      architecturese) page tables for guests using the radix MMU.  Apart from
      the PGD level, which is allocated when the guest is created, the rest
      of the tree is all constructed in response to hypervisor page faults.
      
      As well as hypervisor page faults for missing pages, we also get faults
      for reference/change (RC) bits needing to be set, as well as various
      other error conditions.  For now, we only set the R or C bit in the
      guest page table if the same bit is set in the host PTE for the
      backing page.
      
      This code can take advantage of the guest being backed with either
      transparent or ordinary 2MB huge pages, and insert 2MB page entries
      into the guest page tables.  There is no support for 1GB huge pages
      yet.
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
    • KVM: PPC: Book3S HV: Add basic infrastructure for radix guests · 9e04ba69
      Committed by Paul Mackerras
      This adds a field in struct kvm_arch and an inline helper to
      indicate whether a guest is a radix guest or not, plus a new file
      to contain the radix MMU code, which currently contains just a
      translate function which knows how to traverse the guest page
      tables to translate an address.
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
  23. 24 Nov 2016, 1 commit
    • KVM: PPC: Book3S HV: Adapt to new HPTE format on POWER9 · abb7c7dd
      Committed by Paul Mackerras
      This adapts the KVM-HV hashed page table (HPT) code to read and write
      HPT entries in the new format defined in Power ISA v3.00 on POWER9
      machines.  The new format moves the B (segment size) field from the
      first doubleword to the second, and trims some bits from the AVA
      (abbreviated virtual address) and ARPN (abbreviated real page number)
      fields.  As far as possible, the conversion is done when reading or
      writing the HPT entries, and the rest of the code continues to use
      the old format.
      Signed-off-by: Paul Mackerras <paulus@ozlabs.org>