1. 13 6月, 2018 1 次提交
    • K
      treewide: kzalloc() -> kcalloc() · 6396bb22
      Kees Cook 提交于
      The kzalloc() function has a 2-factor argument form, kcalloc(). This
      patch replaces cases of:
      
              kzalloc(a * b, gfp)
      
      with:
              kcalloc(a, b, gfp)
      
      as well as handling cases of:
      
              kzalloc(a * b * c, gfp)
      
      with:
      
              kzalloc(array3_size(a, b, c), gfp)
      
      as it's slightly less ugly than:
      
              kcalloc(array_size(a, b), c, gfp)
      
      This does, however, attempt to ignore constant size factors like:
      
              kzalloc(4 * 1024, gfp)
      
      though any constants defined via macros get caught up in the conversion.
      
      Any factors with a sizeof() of "unsigned char", "char", and "u8" were
      dropped, since they're redundant.
      
      The Coccinelle script used for this was:
      
      // Fix redundant parens around sizeof().
      @@
      type TYPE;
      expression THING, E;
      @@
      
      (
        kzalloc(
      -	(sizeof(TYPE)) * E
      +	sizeof(TYPE) * E
        , ...)
      |
        kzalloc(
      -	(sizeof(THING)) * E
      +	sizeof(THING) * E
        , ...)
      )
      
      // Drop single-byte sizes and redundant parens.
      @@
      expression COUNT;
      typedef u8;
      typedef __u8;
      @@
      
      (
        kzalloc(
      -	sizeof(u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(__u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(char) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(unsigned char) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(u8) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(__u8) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(char) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc(
      -	sizeof(unsigned char) * COUNT
      +	COUNT
        , ...)
      )
      
      // 2-factor product with sizeof(type/expression) and identifier or constant.
      @@
      type TYPE;
      expression THING;
      identifier COUNT_ID;
      constant COUNT_CONST;
      @@
      
      (
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * (COUNT_ID)
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * COUNT_ID
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * COUNT_CONST
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * (COUNT_ID)
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * COUNT_ID
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * COUNT_CONST
      +	COUNT_CONST, sizeof(THING)
        , ...)
      )
      
      // 2-factor product, only identifiers.
      @@
      identifier SIZE, COUNT;
      @@
      
      - kzalloc
      + kcalloc
        (
      -	SIZE * COUNT
      +	COUNT, SIZE
        , ...)
      
      // 3-factor product with 1 sizeof(type) or sizeof(expression), with
      // redundant parens removed.
      @@
      expression THING;
      identifier STRIDE, COUNT;
      type TYPE;
      @@
      
      (
        kzalloc(
      -	sizeof(TYPE) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc(
      -	sizeof(THING) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc(
      -	sizeof(THING) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc(
      -	sizeof(THING) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc(
      -	sizeof(THING) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      )
      
      // 3-factor product with 2 sizeof(variable), with redundant parens removed.
      @@
      expression THING1, THING2;
      identifier COUNT;
      type TYPE1, TYPE2;
      @@
      
      (
        kzalloc(
      -	sizeof(TYPE1) * sizeof(TYPE2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kzalloc(
      -	sizeof(THING1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kzalloc(
      -	sizeof(THING1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      |
        kzalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      )
      
      // 3-factor product, only identifiers, with redundant parens removed.
      @@
      identifier STRIDE, SIZE, COUNT;
      @@
      
      (
        kzalloc(
      -	(COUNT) * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	COUNT * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	COUNT * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	(COUNT) * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	COUNT * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	(COUNT) * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	(COUNT) * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc(
      -	COUNT * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      )
      
      // Any remaining multi-factor products, first at least 3-factor products,
      // when they're not all constants...
      @@
      expression E1, E2, E3;
      constant C1, C2, C3;
      @@
      
      (
        kzalloc(C1 * C2 * C3, ...)
      |
        kzalloc(
      -	(E1) * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc(
      -	(E1) * (E2) * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc(
      -	(E1) * (E2) * (E3)
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc(
      -	E1 * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      )
      
      // And then all remaining 2 factors products when they're not all constants,
      // keeping sizeof() as the second factor argument.
      @@
      expression THING, E1, E2;
      type TYPE;
      constant C1, C2, C3;
      @@
      
      (
        kzalloc(sizeof(THING) * C2, ...)
      |
        kzalloc(sizeof(TYPE) * C2, ...)
      |
        kzalloc(C1 * C2 * C3, ...)
      |
        kzalloc(C1 * C2, ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * (E2)
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(TYPE) * E2
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * (E2)
      +	E2, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	sizeof(THING) * E2
      +	E2, sizeof(THING)
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	(E1) * E2
      +	E1, E2
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	(E1) * (E2)
      +	E1, E2
        , ...)
      |
      - kzalloc
      + kcalloc
        (
      -	E1 * E2
      +	E1, E2
        , ...)
      )
      Signed-off-by: Kees Cook <keescook@chromium.org>
      6396bb22
  2. 15 5月, 2018 4 次提交
  3. 11 5月, 2018 1 次提交
    • G
      PCI: Add "pci=noats" boot parameter · cef74409
      Gil Kupfer 提交于
      Adds a "pci=noats" boot parameter.  When supplied, all ATS related
      functions fail immediately and the IOMMU is configured to not use
      device-IOTLB.
      
      Any function that checks for ATS capabilities directly against the devices
      should also check this flag.  Currently, such functions exist only in IOMMU
      drivers, and they are covered by this patch.
      
      The motivation behind this patch is the existence of malicious devices.
      Lots of research has been done about how to use the IOMMU as protection
      from such devices.  When ATS is supported, any I/O device can access any
      physical address by faking device-IOTLB entries.  Adding the ability to
      ignore these entries lets sysadmins enhance system security.
      Signed-off-by: Gil Kupfer <gilkup@cs.technion.ac.il>
      Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
      Acked-by: Joerg Roedel <jroedel@suse.de>
      cef74409
  4. 29 3月, 2018 1 次提交
  5. 20 3月, 2018 2 次提交
  6. 17 1月, 2018 2 次提交
    • S
      iommu/vt-d: Enable upto 57 bits of domain address width · 5e3b4a15
      Sohil Mehta 提交于
      Update the IOMMU default domain address width to 57 bits. This would
      enable the IOMMU to do up to 5 levels of paging for second level
      translations - IOVA translation requests without PASID.
      
      Even though the maximum supported address width is being increased to
      57, __iommu_calculate_agaw() would set the actual supported address
      width to the maximum support available in IOMMU hardware.
      Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      5e3b4a15
    • P
      iommu/vt-d: Use domain instead of cache fetching · 9d2e6505
      Peter Xu 提交于
      After commit a1ddcbe9 ("iommu/vt-d: Pass dmar_domain directly into
      iommu_flush_iotlb_psi", 2015-08-12), we have domain pointer as parameter
      to iommu_flush_iotlb_psi(), so no need to fetch it from cache again.
      
      More importantly, a NULL pointer dereference bug is reported on RHEL7 (and
      it can be reproduced on some old upstream kernels too, e.g., v4.13) by
      unplugging a 40G NIC from a VM (hard to test unplug on a real host, but
      it should be the same):
      
      https://bugzilla.redhat.com/show_bug.cgi?id=1531367
      
      [   24.391863] pciehp 0000:00:03.0:pcie004: Slot(0): Attention button pressed
      [   24.393442] pciehp 0000:00:03.0:pcie004: Slot(0): Powering off due to button press
      [   29.721068] i40evf 0000:01:00.0: Unable to send opcode 2 to PF, err I40E_ERR_QUEUE_EMPTY, aq_err OK
      [   29.783557] iommu: Removing device 0000:01:00.0 from group 3
      [   29.784662] BUG: unable to handle kernel NULL pointer dereference at 0000000000000304
      [   29.785817] IP: iommu_flush_iotlb_psi+0xcf/0x120
      [   29.786486] PGD 0
      [   29.786487] P4D 0
      [   29.786812]
      [   29.787390] Oops: 0000 [#1] SMP
      [   29.787876] Modules linked in: ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_ng
      [   29.795371] CPU: 0 PID: 156 Comm: kworker/0:2 Not tainted 4.13.0 #14
      [   29.796366] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.11.0-1.el7 04/01/2014
      [   29.797593] Workqueue: pciehp-0 pciehp_power_thread
      [   29.798328] task: ffff94f5745b4a00 task.stack: ffffb326805ac000
      [   29.799178] RIP: 0010:iommu_flush_iotlb_psi+0xcf/0x120
      [   29.799919] RSP: 0018:ffffb326805afbd0 EFLAGS: 00010086
      [   29.800666] RAX: ffff94f5bc56e800 RBX: 0000000000000000 RCX: 0000000200000025
      [   29.801667] RDX: ffff94f5bc56e000 RSI: 0000000000000082 RDI: 0000000000000000
      [   29.802755] RBP: ffffb326805afbf8 R08: 0000000000000000 R09: ffff94f5bc86bbf0
      [   29.803772] R10: ffffb326805afba8 R11: 00000000000ffdc4 R12: ffff94f5bc86a400
      [   29.804789] R13: 0000000000000000 R14: 00000000ffdc4000 R15: 0000000000000000
      [   29.805792] FS:  0000000000000000(0000) GS:ffff94f5bfc00000(0000) knlGS:0000000000000000
      [   29.806923] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
      [   29.807736] CR2: 0000000000000304 CR3: 000000003499d000 CR4: 00000000000006f0
      [   29.808747] Call Trace:
      [   29.809156]  flush_unmaps_timeout+0x126/0x1c0
      [   29.809800]  domain_exit+0xd6/0x100
      [   29.810322]  device_notifier+0x6b/0x70
      [   29.810902]  notifier_call_chain+0x4a/0x70
      [   29.812822]  __blocking_notifier_call_chain+0x47/0x60
      [   29.814499]  blocking_notifier_call_chain+0x16/0x20
      [   29.816137]  device_del+0x233/0x320
      [   29.817588]  pci_remove_bus_device+0x6f/0x110
      [   29.819133]  pci_stop_and_remove_bus_device+0x1a/0x20
      [   29.820817]  pciehp_unconfigure_device+0x7a/0x1d0
      [   29.822434]  pciehp_disable_slot+0x52/0xe0
      [   29.823931]  pciehp_power_thread+0x8a/0xa0
      [   29.825411]  process_one_work+0x18c/0x3a0
      [   29.826875]  worker_thread+0x4e/0x3b0
      [   29.828263]  kthread+0x109/0x140
      [   29.829564]  ? process_one_work+0x3a0/0x3a0
      [   29.831081]  ? kthread_park+0x60/0x60
      [   29.832464]  ret_from_fork+0x25/0x30
      [   29.833794] Code: 85 ed 74 0b 5b 41 5c 41 5d 41 5e 41 5f 5d c3 49 8b 54 24 60 44 89 f8 0f b6 c4 48 8b 04 c2 48 85 c0 74 49 45 0f b6 ff 4a 8b 3c f8 <80> bf
      [   29.838514] RIP: iommu_flush_iotlb_psi+0xcf/0x120 RSP: ffffb326805afbd0
      [   29.840362] CR2: 0000000000000304
      [   29.841716] ---[ end trace b10ec0d6900868d3 ]---
      
      This patch fixes that problem if applied to v4.13 kernel.
      
      The bug does not exist on latest upstream kernel since it's fixed as a
      side effect of commit 13cf0174 ("iommu/vt-d: Make use of iova
      deferred flushing", 2017-08-15).  But IMHO it's still good to have this
      patch upstream.
      
      CC: Alex Williamson <alex.williamson@redhat.com>
      Signed-off-by: Peter Xu <peterx@redhat.com>
      Fixes: a1ddcbe9 ("iommu/vt-d: Pass dmar_domain directly into iommu_flush_iotlb_psi")
      Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      9d2e6505
  7. 15 1月, 2018 1 次提交
  8. 18 11月, 2017 1 次提交
    • R
      iommu/vt-d: Fix scatterlist offset handling · 29a90b70
      Robin Murphy 提交于
      The intel-iommu DMA ops fail to correctly handle scatterlists where
      sg->offset is greater than PAGE_SIZE - the IOVA allocation is computed
      appropriately based on the page-aligned portion of the offset, but the
      mapping is set up relative to sg->page, which means it fails to actually
      cover the whole buffer (and in the worst case doesn't cover it at all):
      
          (sg->dma_address + sg->dma_len) ----+
          sg->dma_address ---------+          |
          iov_pfn------+           |          |
                       |           |          |
                       v           v          v
      iova:   a        b        c        d        e        f
              |--------|--------|--------|--------|--------|
                                <...calculated....>
                       [_____mapped______]
      pfn:    0        1        2        3        4        5
              |--------|--------|--------|--------|--------|
                       ^           ^          ^
                       |           |          |
          sg->page ----+           |          |
          sg->offset --------------+          |
          (sg->offset + sg->length) ----------+
      
      As a result, the caller ends up overrunning the mapping into whatever
      lies beyond, which usually goes badly:
      
      [  429.645492] DMAR: DRHD: handling fault status reg 2
      [  429.650847] DMAR: [DMA Write] Request device [02:00.4] fault addr f2682000 ...
      
      Whilst this is a fairly rare occurrence, it can happen from the result
      of intermediate scatterlist processing such as scatterwalk_ffwd() in the
      crypto layer. Whilst that particular site could be fixed up, it still
      seems worthwhile to bring intel-iommu in line with other DMA API
      implementations in handling this robustly.
      
      To that end, fix the intel_map_sg() path to line up the mapping
      correctly (in units of MM pages rather than VT-d pages to match the
      aligned_nrpages() calculation) regardless of the offset, and use
      sg_phys() consistently for clarity.
      Reported-by: Harsh Jain <Harsh@chelsio.com>
      Signed-off-by: Robin Murphy <robin.murphy@arm.com>
      Reviewed-by: Ashok Raj <ashok.raj@intel.com>
      Tested-by: Jacob Pan <jacob.jun.pan@intel.com>
      Cc: stable@vger.kernel.org
      Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
      29a90b70
  9. 12 10月, 2017 1 次提交
    • T
      iommu/iova: Make rcache flush optional on IOVA allocation failure · 538d5b33
      Tomasz Nowicki 提交于
      Since IOVA allocation failure is not an unusual case, we need to flush
      CPUs' rcache in the hope that we will succeed in the next round.
      
      However, it is useful to decide whether we need rcache flush step because
      of two reasons:
      - Poor scalability. On a large system with ~100 CPUs, iterating and flushing
        rcache for each CPU becomes a serious bottleneck, so we may want to defer it.
      - free_cpu_cached_iovas() does not care about max PFN we are interested in.
        Thus we may flush our rcaches and still get no new IOVA like in the
        commonly used scenario:
      
          if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev))
              iova = alloc_iova_fast(iovad, iova_len, DMA_BIT_MASK(32) >> shift);
      
          if (!iova)
              iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift);
      
         1. First alloc_iova_fast() call is limited to DMA_BIT_MASK(32) to get
            PCI devices a SAC address
         2. alloc_iova() fails due to full 32-bit space
         3. rcaches contain PFNs out of 32-bit space so free_cpu_cached_iovas()
            throws entries away for nothing and alloc_iova() fails again
         4. Next alloc_iova_fast() call cannot take advantage of rcache since we
            have just defeated caches. In this case we pick the slowest option
            to proceed.
      
      This patch reworks flushed_rcache local flag to be additional function
      argument instead and control rcache flush step. Also, it updates all users
      to do the flush as the last chance.
      Signed-off-by: Tomasz Nowicki <Tomasz.Nowicki@caviumnetworks.com>
      Reviewed-by: Robin Murphy <robin.murphy@arm.com>
      Tested-by: Nate Watterson <nwatters@codeaurora.org>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      538d5b33
  10. 10 10月, 2017 1 次提交
  11. 06 10月, 2017 1 次提交
  12. 27 9月, 2017 1 次提交
  13. 01 9月, 2017 1 次提交
  14. 31 8月, 2017 1 次提交
    • J
      iommu/vt-d: Prevent VMD child devices from being remapping targets · 5823e330
      Jon Derrick 提交于
      VMD child devices must use the VMD endpoint's ID as the requester.  Because
      of this, there needs to be a way to link the parent VMD endpoint's IOMMU
      group and associated mappings to the VMD child devices such that attaching
      and detaching child devices modify the endpoint's mappings, while
      preventing early detaching on a singular device removal or unbinding.
      
      The reassignment of individual VMD child devices to VMs is outside
      the scope of VMD, but may be implemented in the future. For now it is best
      to prevent any such attempts.
      
      Prevent VMD child devices from returning an IOMMU, which prevents it from
      exposing an iommu_group sysfs directory and allowing subsequent binding by
      userspace-access drivers such as VFIO.
      Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
      Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
      5823e330
  15. 30 8月, 2017 1 次提交
  16. 16 8月, 2017 1 次提交
  17. 15 8月, 2017 1 次提交
    • J
      iommu: Fix wrong freeing of iommu_device->dev · 2926a2aa
      Joerg Roedel 提交于
      The struct iommu_device has a 'struct device' embedded into
      it, not as a pointer, but the whole struct. In the
      conversion of the iommu drivers to use struct iommu_device
      it was forgotten that the release function for that struct
      device simply calls kfree() on the pointer.
      
      This frees memory that was never allocated and causes memory
      corruption.
      
      To fix this issue, use a pointer to struct device instead of
      embedding the whole struct. This needs some updates in the
      iommu sysfs code as well as the Intel VT-d and AMD IOMMU
      driver.
      Reported-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
      Fixes: 39ab9555 ('iommu: Add sysfs bindings for struct iommu_device')
      Cc: stable@vger.kernel.org # >= v4.11
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      2926a2aa
  18. 26 7月, 2017 1 次提交
    • D
      iommu/vt-d: Don't free parent pagetable of the PTE we're adding · bc24c571
      David Dillow 提交于
      When adding a large scatterlist entry that covers more than the L3
      superpage size (1GB) but has an alignment such that we must use L2
      superpages (2MB), we give dma_pte_free_level() a range that causes it
      to free the L3 pagetable we're about to populate. We fix this by telling
      dma_pte_free_pagetable() about the pagetable level we're about to populate
      to prevent freeing it.
      
      For example, mapping a scatterlist with entry lengths 854MB and 1194MB
      at IOVA 0xffff80000000 would, when processing the 2MB-aligned second
      entry, cause pfn_to_dma_pte() to create a L3 directory to hold L2
      superpages for the mapping at IOVA 0xffffc0000000. We would previously
      call dma_pte_free_pagetable(domain, 0xffffc0000, 0xfffffffff), which
      would free the L3 directory pfn_to_dma_pte() just created for IO PFN
      0xffffc0000. Telling dma_pte_free_pagetable() to retain the L3
      directories while using L2 superpages avoids the erroneous free.
      Signed-off-by: David Dillow <dillow@google.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      bc24c571
  19. 28 6月, 2017 3 次提交
    • C
      x86: remove arch specific dma_supported implementation · 5860acc1
      Christoph Hellwig 提交于
      And instead wire it up as method for all the dma_map_ops instances.
      
      Note that this also means the arch specific check will be fully instead
      of partially applied in the AMD iommu driver.
      Signed-off-by: Christoph Hellwig <hch@lst.de>
      5860acc1
    • A
      iommu/vt-d: Constify intel_dma_ops · 01e1932a
      Arvind Yadav 提交于
      Most dma_map_ops structures are never modified. Constify these
      structures such that these can be write-protected.
      Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      01e1932a
    • S
      iommu/vt-d: Don't disable preemption while accessing deferred_flush() · 58c4a95f
      Sebastian Andrzej Siewior 提交于
      get_cpu() disables preemption and returns the current CPU number. The
      CPU number is only used once while retrieving the address of the local
      CPU's deferred_flush pointer.
      We can instead use raw_cpu_ptr() while we remain preemptible. The worst
      thing that can happen is that flush_unmaps_timeout() is invoked multiple
      times: once by taskA after seeing HIGH_WATER_MARK and then preempted to
      another CPU and then by taskB which saw HIGH_WATER_MARK on the same CPU
      as taskA. It is also likely that ->size went from HIGH_WATER_MARK to 0
      right after it was read because another CPU invoked flush_unmaps_timeout()
      for this CPU.
      The access to flush_data is protected by a spinlock so even if we get
      migrated to another CPU or preempted - the data structure is protected.
      
      While at it, I marked deferred_flush static since I can't find a
      reference to it outside of this file.
      
      Cc: David Woodhouse <dwmw2@infradead.org>
      Cc: Joerg Roedel <joro@8bytes.org>
      Cc: iommu@lists.linux-foundation.org
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      58c4a95f
  20. 30 5月, 2017 1 次提交
  21. 23 5月, 2017 1 次提交
  22. 17 5月, 2017 1 次提交
    • K
      iommu/vt-d: Flush the IOTLB to get rid of the initial kdump mappings · f73a7eee
      KarimAllah Ahmed 提交于
      Ever since commit 091d42e4 ("iommu/vt-d: Copy translation tables from
      old kernel") the kdump kernel copies the IOMMU context tables from the
      previous kernel. Each device's mappings will be destroyed once the driver
      for the respective device takes over.
      
      This unfortunately breaks the workflow of mapping and unmapping a new
      context to the IOMMU. The mapping function assumes that either:
      
      1) Unmapping did the proper IOMMU flushing and it only ever flushes if the
         IOMMU unit supports caching invalid entries.
      2) The system just booted and the initialization code took care of
         flushing all IOMMU caches.
      
      This assumption is not true for the kdump kernel since the context
      tables have been copied from the previous kernel and translations could
      have been cached ever since. So make sure to flush the IOTLB as well
      when we destroy these old copied mappings.
      
      Cc: Joerg Roedel <joro@8bytes.org>
      Cc: David Woodhouse <dwmw2@infradead.org>
      Cc: David Woodhouse <dwmw@amazon.co.uk>
      Cc: Anthony Liguori <aliguori@amazon.com>
      Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
      Acked-by: David Woodhouse <dwmw@amazon.co.uk>
      Cc: stable@vger.kernel.org  v4.2+
      Fixes: 091d42e4 ("iommu/vt-d: Copy translation tables from old kernel")
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      f73a7eee
  23. 27 4月, 2017 1 次提交
    • S
      x86, iommu/vt-d: Add an option to disable Intel IOMMU force on · bfd20f1c
      Shaohua Li 提交于
      IOMMU harms performance significantly when we run very fast networking
      workloads. It's 40GB networking doing XDP test. Software overhead is
      almost unaware, but it's the IOTLB miss (based on our analysis) which
      kills the performance. We observed the same performance issue even with
      software passthrough (identity mapping), only the hardware passthrough
      survives. The pps with iommu (with software passthrough) is only about
      ~30% of that without it. This is a limitation in hardware based on our
      observation, so we'd like to disable the IOMMU force on, but we do want
      to use TBOOT and we can sacrifice the DMA security bought by IOMMU. I
      must admit I know nothing about TBOOT, but TBOOT guys (cc-ed) think not
      enabling IOMMU is totally ok.
      
      So introduce a new boot option to disable the force on. It's kind of
      silly we need to run into intel_iommu_init even without force on, but we
      need to disable TBOOT PMR registers. For system without the boot option,
      nothing is changed.
      Signed-off-by: Shaohua Li <shli@fb.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      bfd20f1c
  24. 29 3月, 2017 1 次提交
    • J
      iommu/vt-d: Make sure IOMMUs are off when intel_iommu=off · 161b28aa
      Joerg Roedel 提交于
      When booting into a kexec kernel with intel_iommu=off, and
      the previous kernel had intel_iommu=on, the IOMMU hardware
      is still enabled and is not disabled by the new kernel.
      
      This causes the boot to fail because DMA is blocked by the
      hardware. Disable the IOMMUs when we find it enabled in the
      kexec kernel and boot with intel_iommu=off.
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      161b28aa
  25. 22 3月, 2017 2 次提交
    • R
      iommu: Disambiguate MSI region types · 9d3a4de4
      Robin Murphy 提交于
      The introduction of reserved regions has left a couple of rough edges
      which we could do with sorting out sooner rather than later. Since we
      are not yet addressing the potential dynamic aspect of software-managed
      reservations and presenting them at arbitrary fixed addresses, it is
      incongruous that we end up displaying hardware vs. software-managed MSI
      regions to userspace differently, especially since ARM-based systems may
      actually require one or the other, or even potentially both at once,
      (which iommu-dma currently has no hope of dealing with at all). Let's
      resolve the former user-visible inconsistency ASAP before the ABI has
      been baked into a kernel release, in a way that also lays the groundwork
      for the latter shortcoming to be addressed by follow-up patches.
      
      For clarity, rename the software-managed type to IOMMU_RESV_SW_MSI, use
      IOMMU_RESV_MSI to describe the hardware type, and document everything a
      little bit. Since the x86 MSI remapping hardware falls squarely under
      this meaning of IOMMU_RESV_MSI, apply that type to their regions as well,
      so that we tell the same story to userspace across all platforms.
      
      Secondly, as the various region types require quite different handling,
      and it really makes little sense to ever try combining them, convert the
      bitfield-esque #defines to a plain enum in the process before anyone
      gets the wrong impression.
      
      Fixes: d30ddcaa ("iommu: Add a new type field in iommu_resv_region")
      Reviewed-by: Eric Auger <eric.auger@redhat.com>
      CC: Alex Williamson <alex.williamson@redhat.com>
      CC: David Woodhouse <dwmw2@infradead.org>
      CC: kvm@vger.kernel.org
      Signed-off-by: Robin Murphy <robin.murphy@arm.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      9d3a4de4
    • K
      iommu/vt-d: Fix NULL pointer dereference in device_to_iommu · 5003ae1e
      Koos Vriezen 提交于
      The function device_to_iommu() in the Intel VT-d driver
      lacks a NULL-ptr check, resulting in this oops at boot on
      some platforms:
      
       BUG: unable to handle kernel NULL pointer dereference at 00000000000007ab
       IP: [<ffffffff8132234a>] device_to_iommu+0x11a/0x1a0
       PGD 0
      
       [...]
      
       Call Trace:
         ? find_or_alloc_domain.constprop.29+0x1a/0x300
         ? dw_dma_probe+0x561/0x580 [dw_dmac_core]
         ? __get_valid_domain_for_dev+0x39/0x120
         ? __intel_map_single+0x138/0x180
         ? intel_alloc_coherent+0xb6/0x120
         ? sst_hsw_dsp_init+0x173/0x420 [snd_soc_sst_haswell_pcm]
         ? mutex_lock+0x9/0x30
         ? kernfs_add_one+0xdb/0x130
         ? devres_add+0x19/0x60
         ? hsw_pcm_dev_probe+0x46/0xd0 [snd_soc_sst_haswell_pcm]
         ? platform_drv_probe+0x30/0x90
         ? driver_probe_device+0x1ed/0x2b0
         ? __driver_attach+0x8f/0xa0
         ? driver_probe_device+0x2b0/0x2b0
         ? bus_for_each_dev+0x55/0x90
         ? bus_add_driver+0x110/0x210
         ? 0xffffffffa11ea000
         ? driver_register+0x52/0xc0
         ? 0xffffffffa11ea000
         ? do_one_initcall+0x32/0x130
         ? free_vmap_area_noflush+0x37/0x70
         ? kmem_cache_alloc+0x88/0xd0
         ? do_init_module+0x51/0x1c4
         ? load_module+0x1ee9/0x2430
         ? show_taint+0x20/0x20
         ? kernel_read_file+0xfd/0x190
         ? SyS_finit_module+0xa3/0xb0
         ? do_syscall_64+0x4a/0xb0
         ? entry_SYSCALL64_slow_path+0x25/0x25
       Code: 78 ff ff ff 4d 85 c0 74 ee 49 8b 5a 10 0f b6 9b e0 00 00 00 41 38 98 e0 00 00 00 77 da 0f b6 eb 49 39 a8 88 00 00 00 72 ce eb 8f <41> f6 82 ab 07 00 00 04 0f 85 76 ff ff ff 0f b6 4d 08 88 0e 49
       RIP  [<ffffffff8132234a>] device_to_iommu+0x11a/0x1a0
        RSP <ffffc90001457a78>
       CR2: 00000000000007ab
       ---[ end trace 16f974b6d58d0aad ]---
      
      Add the missing pointer check.
      
      Fixes: 1c387188 ("iommu/vt-d: Fix IOMMU lookup for SR-IOV Virtual Functions")
      Signed-off-by: Koos Vriezen <koos.vriezen@gmail.com>
      Cc: stable@vger.kernel.org # 4.8.15+
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      5003ae1e
  26. 28 2月, 2017 1 次提交
  27. 25 2月, 2017 1 次提交
  28. 10 2月, 2017 3 次提交
  29. 31 1月, 2017 2 次提交
    • D
      iommu/vt-d: Don't over-free page table directories · f7116e11
      David Dillow 提交于
      dma_pte_free_level() recurses down the IOMMU page tables and frees
      directory pages that are entirely contained in the given PFN range.
      Unfortunately, it incorrectly calculates the starting address covered
      by the PTE under consideration, which can lead to it clearing an entry
      that is still in use.
      
      This occurs if we have a scatterlist with an entry that has a length
      greater than 1026 MB and is aligned to 2 MB for both the IOMMU and
      physical addresses. For example, if __domain_mapping() is asked to map a
      two-entry scatterlist with 2 MB and 1028 MB segments to PFN 0xffff80000,
      it will ask dma_pte_free_pagetable() to free the PFNs from 0xffff80200
      to 0xffffc05ff, which will also incorrectly clear the PFNs from
      0xffff80000 to 0xffff801ff because of this issue. The current code will
      set level_pfn to 0xffff80200, and 0xffff80200-0xffffc01ff fits inside
      the range being cleared. Properly setting the level_pfn for the current
      level under consideration catches that this PTE is outside of the range
      being cleared.
      
      This patch also changes the value passed into dma_pte_free_level() when
      it recurses. This only affects the first PTE of the range being cleared,
      and is handled by the existing code that ensures we start our cursor no
      lower than start_pfn.
      
      This was found when using dma_map_sg() to map large chunks of contiguous
      memory, which immediately led to faults on the first access of the
      erroneously-deleted mappings.
      
      Fixes: 3269ee0b ("intel-iommu: Fix leaks in pagetable freeing")
      Reviewed-by: Benjamin Serebrin <serebrin@google.com>
      Signed-off-by: David Dillow <dillow@google.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      f7116e11
    • A
      iommu/vt-d: Tylersburg isoch identity map check is done too late. · 21e722c4
      Ashok Raj 提交于
      The check to set the identity map for Tylersburg is done too late. It
      needs to be done before the check for the identity_map domain.
      
      To: Joerg Roedel <joro@8bytes.org>
      To: David Woodhouse <dwmw2@infradead.org>
      Cc: iommu@lists.linux-foundation.org
      Cc: linux-kernel@vger.kernel.org
      Cc: stable@vger.kernel.org
      Cc: Ashok Raj <ashok.raj@intel.com>
      
      Fixes: 86080ccc ("iommu/vt-d: Allocate si_domain in init_dmars()")
      Signed-off-by: Ashok Raj <ashok.raj@intel.com>
      Reported-by: Yunhong Jiang <yunhong.jiang@intel.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      21e722c4