1. 13 6月, 2018 1 次提交
    • K
      treewide: kzalloc_node() -> kcalloc_node() · 590b5b7d
      Kees Cook 提交于
      The kzalloc_node() function has a 2-factor argument form, kcalloc_node(). This
      patch replaces cases of:
      
              kzalloc_node(a * b, gfp, node)
      
      with:
              kcalloc_node(a, b, gfp, node)
      
      as well as handling cases of:
      
              kzalloc_node(a * b * c, gfp, node)
      
      with:
      
              kzalloc_node(array3_size(a, b, c), gfp, node)
      
      as it's slightly less ugly than:
      
              kcalloc_node(array_size(a, b), c, gfp, node)
      
      This does, however, attempt to ignore constant size factors like:
      
              kzalloc_node(4 * 1024, gfp, node)
      
      though any constants defined via macros get caught up in the conversion.
      
      Any factors with a sizeof() of "unsigned char", "char", and "u8" were
      dropped, since they're redundant.
      
      The Coccinelle script used for this was:
      
      // Fix redundant parens around sizeof().
      @@
      type TYPE;
      expression THING, E;
      @@
      
      (
        kzalloc_node(
      -	(sizeof(TYPE)) * E
      +	sizeof(TYPE) * E
        , ...)
      |
        kzalloc_node(
      -	(sizeof(THING)) * E
      +	sizeof(THING) * E
        , ...)
      )
      
      // Drop single-byte sizes and redundant parens.
      @@
      expression COUNT;
      typedef u8;
      typedef __u8;
      @@
      
      (
        kzalloc_node(
      -	sizeof(u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc_node(
      -	sizeof(__u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc_node(
      -	sizeof(char) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc_node(
      -	sizeof(unsigned char) * (COUNT)
      +	COUNT
        , ...)
      |
        kzalloc_node(
      -	sizeof(u8) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc_node(
      -	sizeof(__u8) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc_node(
      -	sizeof(char) * COUNT
      +	COUNT
        , ...)
      |
        kzalloc_node(
      -	sizeof(unsigned char) * COUNT
      +	COUNT
        , ...)
      )
      
      // 2-factor product with sizeof(type/expression) and identifier or constant.
      @@
      type TYPE;
      expression THING;
      identifier COUNT_ID;
      constant COUNT_CONST;
      @@
      
      (
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(TYPE) * (COUNT_ID)
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(TYPE) * COUNT_ID
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(TYPE) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(TYPE) * COUNT_CONST
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(THING) * (COUNT_ID)
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(THING) * COUNT_ID
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(THING) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(THING)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(THING) * COUNT_CONST
      +	COUNT_CONST, sizeof(THING)
        , ...)
      )
      
      // 2-factor product, only identifiers.
      @@
      identifier SIZE, COUNT;
      @@
      
      - kzalloc_node
      + kcalloc_node
        (
      -	SIZE * COUNT
      +	COUNT, SIZE
        , ...)
      
      // 3-factor product with 1 sizeof(type) or sizeof(expression), with
      // redundant parens removed.
      @@
      expression THING;
      identifier STRIDE, COUNT;
      type TYPE;
      @@
      
      (
        kzalloc_node(
      -	sizeof(TYPE) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc_node(
      -	sizeof(TYPE) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc_node(
      -	sizeof(TYPE) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc_node(
      -	sizeof(TYPE) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kzalloc_node(
      -	sizeof(THING) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc_node(
      -	sizeof(THING) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc_node(
      -	sizeof(THING) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kzalloc_node(
      -	sizeof(THING) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      )
      
      // 3-factor product with 2 sizeof(variable), with redundant parens removed.
      @@
      expression THING1, THING2;
      identifier COUNT;
      type TYPE1, TYPE2;
      @@
      
      (
        kzalloc_node(
      -	sizeof(TYPE1) * sizeof(TYPE2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kzalloc_node(
      -	sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kzalloc_node(
      -	sizeof(THING1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kzalloc_node(
      -	sizeof(THING1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kzalloc_node(
      -	sizeof(TYPE1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      |
        kzalloc_node(
      -	sizeof(TYPE1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      )
      
      // 3-factor product, only identifiers, with redundant parens removed.
      @@
      identifier STRIDE, SIZE, COUNT;
      @@
      
      (
        kzalloc_node(
      -	(COUNT) * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc_node(
      -	COUNT * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc_node(
      -	COUNT * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc_node(
      -	(COUNT) * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc_node(
      -	COUNT * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc_node(
      -	(COUNT) * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc_node(
      -	(COUNT) * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kzalloc_node(
      -	COUNT * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      )
      
      // Any remaining multi-factor products, first at least 3-factor products,
      // when they're not all constants...
      @@
      expression E1, E2, E3;
      constant C1, C2, C3;
      @@
      
      (
        kzalloc_node(C1 * C2 * C3, ...)
      |
        kzalloc_node(
      -	(E1) * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc_node(
      -	(E1) * (E2) * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc_node(
      -	(E1) * (E2) * (E3)
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kzalloc_node(
      -	E1 * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      )
      
      // And then all remaining 2 factors products when they're not all constants,
      // keeping sizeof() as the second factor argument.
      @@
      expression THING, E1, E2;
      type TYPE;
      constant C1, C2, C3;
      @@
      
      (
        kzalloc_node(sizeof(THING) * C2, ...)
      |
        kzalloc_node(sizeof(TYPE) * C2, ...)
      |
        kzalloc_node(C1 * C2 * C3, ...)
      |
        kzalloc_node(C1 * C2, ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(TYPE) * (E2)
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(TYPE) * E2
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(THING) * (E2)
      +	E2, sizeof(THING)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	sizeof(THING) * E2
      +	E2, sizeof(THING)
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	(E1) * E2
      +	E1, E2
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	(E1) * (E2)
      +	E1, E2
        , ...)
      |
      - kzalloc_node
      + kcalloc_node
        (
      -	E1 * E2
      +	E1, E2
        , ...)
      )
      Signed-off-by: Kees Cook <keescook@chromium.org>
      590b5b7d
  2. 07 6月, 2018 1 次提交
    • K
      treewide: Use struct_size() for kmalloc()-family · acafe7e3
      Kees Cook 提交于
      One of the more common cases of allocation size calculations is finding
      the size of a structure that has a zero-sized array at the end, along
      with memory for some number of elements for that array. For example:
      
      struct foo {
          int stuff;
          void *entry[];
      };
      
      instance = kmalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL);
      
      Instead of leaving these open-coded and prone to type mistakes, we can
      now use the new struct_size() helper:
      
      instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL);
      
      This patch makes the changes for kmalloc()-family (and kvmalloc()-family)
      uses. It was done via automatic conversion with manual review for the
      "CHECKME" non-standard cases noted below, using the following Coccinelle
      script:
      
      // pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
      //                      sizeof *pkey_cache->table, GFP_KERNEL);
      @@
      identifier alloc =~ "kmalloc|kzalloc|kvmalloc|kvzalloc";
      expression GFP;
      identifier VAR, ELEMENT;
      expression COUNT;
      @@
      
      - alloc(sizeof(*VAR) + COUNT * sizeof(*VAR->ELEMENT), GFP)
      + alloc(struct_size(VAR, ELEMENT, COUNT), GFP)
      
      // mr = kzalloc(sizeof(*mr) + m * sizeof(mr->map[0]), GFP_KERNEL);
      @@
      identifier alloc =~ "kmalloc|kzalloc|kvmalloc|kvzalloc";
      expression GFP;
      identifier VAR, ELEMENT;
      expression COUNT;
      @@
      
      - alloc(sizeof(*VAR) + COUNT * sizeof(VAR->ELEMENT[0]), GFP)
      + alloc(struct_size(VAR, ELEMENT, COUNT), GFP)
      
      // Same pattern, but can't trivially locate the trailing element name,
      // or variable name.
      @@
      identifier alloc =~ "kmalloc|kzalloc|kvmalloc|kvzalloc";
      expression GFP;
      expression SOMETHING, COUNT, ELEMENT;
      @@
      
      - alloc(sizeof(SOMETHING) + COUNT * sizeof(ELEMENT), GFP)
      + alloc(CHECKME_struct_size(&SOMETHING, ELEMENT, COUNT), GFP)
      Signed-off-by: Kees Cook <keescook@chromium.org>
      acafe7e3
  3. 29 5月, 2018 1 次提交
  4. 24 5月, 2018 1 次提交
  5. 17 5月, 2018 1 次提交
  6. 15 5月, 2018 1 次提交
    • B
      IB: Fix RDMA_RXE and INFINIBAND_RDMAVT dependencies for DMA_VIRT_OPS · e02637e9
      Ben Hutchings 提交于
      DMA_VIRT_OPS requires that dma_addr_t is at least as wide as a
      pointer, which is expressed as a dependency on !64BIT ||
      ARCH_DMA_ADDR_T_64BIT.
      
      For parisc64 this is not true, and if these IB modules are enabled,
      kconfig warns:
      
      WARNING: unmet direct dependencies detected for DMA_VIRT_OPS
        Depends on [n]: HAS_DMA [=y] && (!64BIT [=y] || ARCH_DMA_ADDR_T_64BIT)
        Selected by [m]:
        - INFINIBAND_RDMAVT [=m] && INFINIBAND [=m] && 64BIT [=y] && PCI [=y]
        - RDMA_RXE [=m] && INET [=y] && PCI [=y] && INFINIBAND [=m]
      
      Add dependencies to fix this.
      Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
      Signed-off-by: Doug Ledford <dledford@redhat.com>
      e02637e9
  7. 10 5月, 2018 3 次提交
  8. 09 5月, 2018 1 次提交
    • A
      nvmet,rxe: defer ip datagram sending to tasklet · 1661d3b0
      Alexandru Moise 提交于
      This addresses 3 separate problems:
      
      1. When using NVME over Fabrics we may end up sending IP
      packets in interrupt context, we should defer this work
      to a tasklet.
      
      [   50.939957] WARNING: CPU: 3 PID: 0 at kernel/softirq.c:161 __local_bh_enable_ip+0x1f/0xa0
      [   50.942602] CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G        W         4.17.0-rc3-ARCH+ #104
      [   50.945466] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014
      [   50.948163] RIP: 0010:__local_bh_enable_ip+0x1f/0xa0
      [   50.949631] RSP: 0018:ffff88009c183900 EFLAGS: 00010006
      [   50.951029] RAX: 0000000080010403 RBX: 0000000000000200 RCX: 0000000000000001
      [   50.952636] RDX: 0000000000000000 RSI: 0000000000000200 RDI: ffffffff817e04ec
      [   50.954278] RBP: ffff88009c183910 R08: 0000000000000001 R09: 0000000000000614
      [   50.956000] R10: ffffea00021d5500 R11: 0000000000000001 R12: ffffffff817e04ec
      [   50.957779] R13: 0000000000000000 R14: ffff88009566f400 R15: ffff8800956c7000
      [   50.959402] FS:  0000000000000000(0000) GS:ffff88009c180000(0000) knlGS:0000000000000000
      [   50.961552] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
      [   50.963798] CR2: 000055c4ec0ccac0 CR3: 0000000002209001 CR4: 00000000000606e0
      [   50.966121] Call Trace:
      [   50.966845]  <IRQ>
      [   50.967497]  __dev_queue_xmit+0x62d/0x690
      [   50.968722]  dev_queue_xmit+0x10/0x20
      [   50.969894]  neigh_resolve_output+0x173/0x190
      [   50.971244]  ip_finish_output2+0x2b8/0x370
      [   50.972527]  ip_finish_output+0x1d2/0x220
      [   50.973785]  ? ip_finish_output+0x1d2/0x220
      [   50.975010]  ip_output+0xd4/0x100
      [   50.975903]  ip_local_out+0x3b/0x50
      [   50.976823]  rxe_send+0x74/0x120
      [   50.977702]  rxe_requester+0xe3b/0x10b0
      [   50.978881]  ? ip_local_deliver_finish+0xd1/0xe0
      [   50.980260]  rxe_do_task+0x85/0x100
      [   50.981386]  rxe_run_task+0x2f/0x40
      [   50.982470]  rxe_post_send+0x51a/0x550
      [   50.983591]  nvmet_rdma_queue_response+0x10a/0x170
      [   50.985024]  __nvmet_req_complete+0x95/0xa0
      [   50.986287]  nvmet_req_complete+0x15/0x60
      [   50.987469]  nvmet_bio_done+0x2d/0x40
      [   50.988564]  bio_endio+0x12c/0x140
      [   50.989654]  blk_update_request+0x185/0x2a0
      [   50.990947]  blk_mq_end_request+0x1e/0x80
      [   50.991997]  nvme_complete_rq+0x1cc/0x1e0
      [   50.993171]  nvme_pci_complete_rq+0x117/0x120
      [   50.994355]  __blk_mq_complete_request+0x15e/0x180
      [   50.995988]  blk_mq_complete_request+0x6f/0xa0
      [   50.997304]  nvme_process_cq+0xe0/0x1b0
      [   50.998494]  nvme_irq+0x28/0x50
      [   50.999572]  __handle_irq_event_percpu+0xa2/0x1c0
      [   51.000986]  handle_irq_event_percpu+0x32/0x80
      [   51.002356]  handle_irq_event+0x3c/0x60
      [   51.003463]  handle_edge_irq+0x1c9/0x200
      [   51.004473]  handle_irq+0x23/0x30
      [   51.005363]  do_IRQ+0x46/0xd0
      [   51.006182]  common_interrupt+0xf/0xf
      [   51.007129]  </IRQ>
      
      2. Work must always be offloaded to tasklet for rxe_post_send_kernel()
      when using NVMEoF in order to solve lock ordering between neigh->ha_lock
      seqlock and the nvme queue lock:
      
      [   77.833783]  Possible interrupt unsafe locking scenario:
      [   77.833783]
      [   77.835831]        CPU0                    CPU1
      [   77.837129]        ----                    ----
      [   77.838313]   lock(&(&n->ha_lock)->seqcount);
      [   77.839550]                                local_irq_disable();
      [   77.841377]                                lock(&(&nvmeq->q_lock)->rlock);
      [   77.843222]                                lock(&(&n->ha_lock)->seqcount);
      [   77.845178]   <Interrupt>
      [   77.846298]     lock(&(&nvmeq->q_lock)->rlock);
      [   77.847986]
      [   77.847986]  *** DEADLOCK ***
      
      3. Same goes for the lock ordering between sch->q.lock and nvme queue lock:
      
      [   47.634271]  Possible interrupt unsafe locking scenario:
      [   47.634271]
      [   47.636452]        CPU0                    CPU1
      [   47.637861]        ----                    ----
      [   47.639285]   lock(&(&sch->q.lock)->rlock);
      [   47.640654]                                local_irq_disable();
      [   47.642451]                                lock(&(&nvmeq->q_lock)->rlock);
      [   47.644521]                                lock(&(&sch->q.lock)->rlock);
      [   47.646480]   <Interrupt>
      [   47.647263]     lock(&(&nvmeq->q_lock)->rlock);
      [   47.648492]
      [   47.648492]  *** DEADLOCK ***
      
      Using NVMEoF after this patch seems to finally be stable; without it,
      rxe eventually deadlocks the whole system and causes RCU stalls.
      Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
      Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
      Signed-off-by: Doug Ledford <dledford@redhat.com>
      1661d3b0
  9. 01 5月, 2018 1 次提交
  10. 28 4月, 2018 5 次提交
  11. 20 4月, 2018 3 次提交
  12. 18 4月, 2018 2 次提交
  13. 06 4月, 2018 2 次提交
    • M
      IB/rxe: Fix for oops in rxe_register_device on ppc64le arch · efc365e7
      Mikhail Malygin 提交于
      On ppc64le arch rxe_add command causes oops in kernel log:
      
      [   92.495140] Oops: Kernel access of bad area, sig: 11 [#1]
      [   92.499710] SMP NR_CPUS=2048 NUMA pSeries
      [   92.499792] Modules linked in: ipt_MASQUERADE(E) nf_nat_masquerade_ipv4(E) nf_conntrack_netlink(E) nfnetlink(E) xfrm_user(E) iptable
      _nat(E) nf_conntrack_ipv4(E) nf_defrag_ipv4(E) nf_nat_ipv4(E) xt_addrtype(E) iptable_filter(E) ip_tables(E) xt_conntrack(E) x_tables(E)
       nf_nat(E) nf_conntrack(E) br_netfilter(E) bridge(E) stp(E) llc(E) overlay(E) af_packet(E) rpcrdma(E) ib_isert(E) iscsi_target_mod(E) i
      b_iser(E) libiscsi(E) ib_srpt(E) target_core_mod(E) ib_srp(E) ib_ipoib(E) rdma_ucm(E) ib_ucm(E) ib_uverbs(E) ib_umad(E) bochs_drm(E) tt
      m(E) drm_kms_helper(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) fb_sys_fops(E) drm(E) agpgart(E) virtio_rng(E) virtio_console(E) rtc_
      generic(E) dm_ec(OEN) ttln_rdma(OEN) rdma_cm(E) configfs(E) iw_cm(E) ib_cm(E) rdma_rxe(E) ip6_udp_tunnel(E) udp_tunnel(E) ib_core(E) ql
      a2xxx(E)
      [   92.499832]  scsi_transport_fc(E) nvme_fc(E) nvme_fabrics(E) nvme_core(E) ipmi_watchdog(E) ipmi_ssif(E) ipmi_poweroff(E) ipmi_powernv(EX) ipmi_devintf(E) ipmi_msghandler(E) dummy(E) ext4(E) crc16(E) jbd2(E) mbcache(E) dm_service_time(E) scsi_transport_iscsi(E) sd_mod(E) sr_mod(E) cdrom(E) hid_generic(E) usbhid(E) virtio_blk(E) virtio_scsi(E) virtio_net(E) ibmvscsi(EX) scsi_transport_srp(E) xhci_pci(E) xhci_hcd(E) usbcore(E) usb_common(E) virtio_pci(E) virtio_ring(E) virtio(E) sunrpc(E) dm_mirror(E) dm_region_hash(E) dm_log(E) sg(E) dm_multipath(E) dm_mod(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) scsi_mod(E) autofs4(E)
      [   92.499834] Supported: No, Unsupported modules are loaded
      [   92.499839] CPU: 3 PID: 5576 Comm: sh Tainted: G           OE   NX 4.4.120-ttln.17-default #1
      [   92.499841] task: c0000000afe8a490 ti: c0000000beba8000 task.ti: c0000000beba8000
      [   92.499842] NIP: c00000000008ba3c LR: c000000000027644 CTR: c00000000008ba10
      [   92.499844] REGS: c0000000bebab750 TRAP: 0300   Tainted: G           OE   NX  (4.4.120-ttln.17-default)
      [   92.499850] MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 28424428  XER: 20000000
      [   92.499871] CFAR: 0000000000002424 DAR: 0000000000000208 DSISR: 40000000 SOFTE: 1
                     GPR00: c000000000027644 c0000000bebab9d0 c000000000f09700 0000000000000000
                     GPR04: d0000000043d7192 0000000000000002 000000000000001a fffffffffffffffe
                     GPR08: 000000000000009c c00000000008ba10 d0000000043e5848 d0000000043d3828
                     GPR12: c00000000008ba10 c000000007a02400 0000000010062e38 0000010020388860
                     GPR16: 0000000000000000 0000000000000000 00000100203885f0 00000000100f6c98
                     GPR20: c0000000b3f1fcc0 c0000000b3f1fc48 c0000000b3f1fbd0 c0000000b3f1fb58
                     GPR24: c0000000b3f1fae0 c0000000b3f1fa68 00000000000005dc c0000000b3f1f9f0
                     GPR28: d0000000043e5848 c0000000b3f1f900 c0000000b3f1f320 c0000000b3f1f000
      [   92.499881] NIP [c00000000008ba3c] dma_get_required_mask_pSeriesLP+0x2c/0x1a0
      [   92.499885] LR [c000000000027644] dma_get_required_mask+0x44/0xac
      [   92.499886] Call Trace:
      [   92.499891] [c0000000bebab9d0] [c0000000bebaba30] 0xc0000000bebaba30 (unreliable)
      [   92.499894] [c0000000bebaba10] [c000000000027644] dma_get_required_mask+0x44/0xac
      [   92.499904] [c0000000bebaba30] [d0000000043cb4b4] rxe_register_device+0xc4/0x430 [rdma_rxe]
      [   92.499910] [c0000000bebabab0] [d0000000043c06c8] rxe_add+0x448/0x4e0 [rdma_rxe]
      [   92.499915] [c0000000bebabb30] [d0000000043d28dc] rxe_net_add+0x4c/0xf0 [rdma_rxe]
      [   92.499921] [c0000000bebabb60] [d0000000043d305c] rxe_param_set_add+0x6c/0x1ac [rdma_rxe]
      [   92.499924] [c0000000bebabbf0] [c0000000000e78c0] param_attr_store+0xa0/0x180
      [   92.499927] [c0000000bebabc70] [c0000000000e6448] module_attr_store+0x48/0x70
      [   92.499932] [c0000000bebabc90] [c000000000391f60] sysfs_kf_write+0x70/0xb0
      [   92.499935] [c0000000bebabcb0] [c000000000390f1c] kernfs_fop_write+0x18c/0x1e0
      [   92.499939] [c0000000bebabd00] [c0000000002e22ac] __vfs_write+0x4c/0x1d0
      [   92.499942] [c0000000bebabd90] [c0000000002e2f94] vfs_write+0xc4/0x200
      [   92.499945] [c0000000bebabde0] [c0000000002e488c] SyS_write+0x6c/0x110
      [   92.499948] [c0000000bebabe30] [c000000000009384] system_call+0x38/0xe4
      [   92.499949] Instruction dump:
      [   92.499954] 4e800020 3c4c00e8 3842dcf0 7c0802a6 f8010010 60000000 7c0802a6 fba1ffe8
      [   92.499958] fbc1fff0 fbe1fff8 f8010010 f821ffc1 <e9230208> 7c7e1b78 2fa90000 419e0078
      [   92.499962] ---[ end trace bed077e15eb420cf ]---
      
      It fails in dma_get_required_mask(), which has a ppc-specific implementation
      that fails if the provided device argument is NULL.
      Signed-off-by: Mikhail Malygin <mikhail@malygin.me>
      Reviewed-by: Yonatan Cohen <yonatanc@mellanox.com>
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      efc365e7
    • P
      IB/rxe: Removed GID add/del dummy routines · 39e00b6c
      Parav Pandit 提交于
      rxe driver's add_gid() and del_gid() callbacks are doing simple
      checks which are already done by the ib core before invoking these
      callback routines.
      Therefore, code is simplified to skip implementing add_gid() and
      del_gid() callback functions.
      They are only invoked by ib_core if they are implemented.
      Signed-off-by: Parav Pandit <parav@mellanox.com>
      Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      39e00b6c
  14. 04 4月, 2018 3 次提交
  15. 30 3月, 2018 1 次提交
  16. 28 3月, 2018 1 次提交
    • J
      RDMA/rxe: Fix uABI structure layouts for 32/64 compat · f2e9bfac
      Jason Gunthorpe 提交于
      With 32 bit compilation several of the fields become misaligned here.
      Fixing this is an ABI break for 32 bit rxe and it is in well used
      portions of the rxe ABI.
      
      To handle this we bump the ABI version, as expected. However the user
      space driver doesn't handle it properly today, so all existing user
      space continues to work.
      
      Updated userspace will start to require the necessary kernel version.
      
      We don't expect there to be any 32 bit users of rxe. Most likely cases,
      such as ARM 32 already generally don't work because rxe does not handle
      the CPU cache properly on its shared with userspace pages.
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      f2e9bfac
  17. 20 3月, 2018 1 次提交
  18. 16 3月, 2018 3 次提交
  19. 15 3月, 2018 2 次提交
    • M
      rdma_rxe: make rxe work over 802.1q VLAN devices · 43c9fc50
      Martin Wilck 提交于
      This patch fixes RDMA/rxe over 802.1q VLAN devices.
      
      Without it, I observed the following behavior:
      
      a) adding a VLAN device to RXE via rxe_net_add() creates a non-functional
         RDMA device. This is caused by the logic in enum_all_gids_of_dev_cb() /
         is_eth_port_of_netdev(), which only considers networks connected to
         "upper devices" of the configured network device, resulting in an empty
         set of gids for a VLAN interface that is an "upper device" itself.
         Later attempts to connect via this rdma device fail in cma_acquire_dev()
         because no gids can be resolved.
      
      b) adding the master device of the VLAN device instead seems to work
         initially, target addresses via VLAN devices are resolved successfully.
         But the connection times out because no 802.1q VLAN headers are
         inserted in the ethernet packets, which are therefore never received.
         This happens because the RXE layer sends the packets via the master
         device rather than the VLAN device.
      
      The problem could be solved by changing either a) or b). My thinking was
      that the logic in a) was created deliberately, thus I decided to work on
      b). It turns out that the information about the VLAN interface for the gid
      at hand is available in the AV information. My patch converts the RXE code
      to use this netdev instead of rxe->ndev. With this change, RXE over vlan
      works on my test system.
      Signed-off-by: Martin Wilck <mwilck@suse.com>
      Reviewed-by: Moni Shoua <monis@mellanox.com>
      Signed-off-by: Doug Ledford <dledford@redhat.com>
      43c9fc50
    • T
      RDMAVT: Fix synchronization around percpu_ref · 74b44bbe
      Tejun Heo 提交于
      rvt_mregion uses percpu_ref for reference counting and RCU to protect
      accesses from lkey_table.  When a rvt_mregion needs to be freed, it
      first gets unregistered from lkey_table and then rvt_check_refs() is
      called to wait for in-flight usages before the rvt_mregion is freed.
      
      rvt_check_refs() seems to have a couple issues.
      
      * It has a fast exit path which tests percpu_ref_is_zero().  However,
        a percpu_ref reading zero doesn't mean that the object can be
        released.  In fact, the ->release() callback might not even have
        started executing yet.  Proceeding with freeing can lead to
        use-after-free.
      
      * lkey_table is RCU protected but there is no RCU grace period in the
        free path.  percpu_ref uses RCU internally but it's sched-RCU whose
        grace periods are different from regular RCU.  Also, it generally
        isn't a good idea to depend on internal behaviors like this.
      
      To address the above issues, this patch removes the fast exit and adds
      an explicit synchronize_rcu().
      Signed-off-by: Tejun Heo <tj@kernel.org>
      Acked-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
      Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
      Cc: linux-rdma@vger.kernel.org
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      74b44bbe
  20. 14 3月, 2018 1 次提交
  21. 08 3月, 2018 3 次提交
    • Z
      IB/rxe: change the function rxe_init_device_param type · befd8d98
      Zhu Yanjun 提交于
      The function rxe_init_device_param always returns 0. So the function
      type is changed to void.
      
      CC: Srinivas Eeda <srinivas.eeda@oracle.com>
      CC: Junxiao Bi <junxiao.bi@oracle.com>
      Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
      Signed-off-by: Doug Ledford <dledford@redhat.com>
      befd8d98
    • Z
      IB/rxe: remove unnecessary rxe in rxe_send · 31f1bd14
      Zhu Yanjun 提交于
      The variable rxe is not used in the function rxe_send,
      so it should be removed.
      
      CC: Srinivas Eeda <srinivas.eeda@oracle.com>
      CC: Junxiao Bi <junxiao.bi@oracle.com>
      Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
      Signed-off-by: Doug Ledford <dledford@redhat.com>
      31f1bd14
    • Z
      IB/rxe: remove unnecessary skb_clone · 86af6176
      Zhu Yanjun 提交于
      In send_atomic_ack function, it is not necessary to make a
      skb_clone. To gain better performance (high throughput and
      low latency), this skb_clone is removed.
      
      The following tests are made.
      
       server                       client
      ---------                    ---------
      |1.1.1.1|<----rxe-channel--->|1.1.1.2|
      ---------                    ---------
      
      On server: rping -s -a 1.1.1.1 -v -C 1000 -S 512
      On client: rping -c -a 1.1.1.1 -v -C 1000 -S 512
      
      The kernel config CONFIG_DEBUG_KMEMLEAK is enabled on both server
      and client.
      
      This test runs for several hours. There is no memory leak and the whole
      system can work well.
      
      Based on the above network, the following tests are made.
      
      Server: ibv_rc_pingpong -d rxe0 -g 1
      Client: ibv_rc_pingpong -d rxe0 -g 1 1.1.1.1
      
      The test results on Server(10 tests are made).
      Before:
      Throughput is 137.07 Mbit/sec
      Latency is 517.76 usec/iter
      
      After:
      Throughput is 148.85 Mbit/sec
      Latency is 476.64 usec/iter
      
      The throughput is enhanced and the latency is reduced.
      
      CC: Srinivas Eeda <srinivas.eeda@oracle.com>
      CC: Junxiao Bi <junxiao.bi@oracle.com>
      Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
      Signed-off-by: Doug Ledford <dledford@redhat.com>
      86af6176
  22. 07 3月, 2018 2 次提交
    • B
      RDMA/rxe: Fix an out-of-bounds read · a6544a62
      Bart Van Assche 提交于
      This patch avoids that KASAN reports the following when the SRP initiator
      calls srp_post_send():
      
      ==================================================================
      BUG: KASAN: stack-out-of-bounds in rxe_post_send+0x5c4/0x980 [rdma_rxe]
      Read of size 8 at addr ffff880066606e30 by task 02-mq/1074
      
      CPU: 2 PID: 1074 Comm: 02-mq Not tainted 4.16.0-rc3-dbg+ #1
      Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
      Call Trace:
      dump_stack+0x85/0xc7
      print_address_description+0x65/0x270
      kasan_report+0x231/0x350
      rxe_post_send+0x5c4/0x980 [rdma_rxe]
      srp_post_send.isra.16+0x149/0x190 [ib_srp]
      srp_queuecommand+0x94d/0x1670 [ib_srp]
      scsi_dispatch_cmd+0x1c2/0x550 [scsi_mod]
      scsi_queue_rq+0x843/0xa70 [scsi_mod]
      blk_mq_dispatch_rq_list+0x143/0xac0
      blk_mq_do_dispatch_ctx+0x1c5/0x260
      blk_mq_sched_dispatch_requests+0x2bf/0x2f0
      __blk_mq_run_hw_queue+0xdb/0x160
      __blk_mq_delay_run_hw_queue+0xba/0x100
      blk_mq_run_hw_queue+0xf2/0x190
      blk_mq_sched_insert_request+0x163/0x2f0
      blk_execute_rq+0xb0/0x130
      scsi_execute+0x14e/0x260 [scsi_mod]
      scsi_probe_and_add_lun+0x366/0x13d0 [scsi_mod]
      __scsi_scan_target+0x18a/0x810 [scsi_mod]
      scsi_scan_target+0x11e/0x130 [scsi_mod]
      srp_create_target+0x1522/0x19e0 [ib_srp]
      kernfs_fop_write+0x180/0x210
      __vfs_write+0xb1/0x2e0
      vfs_write+0xf6/0x250
      SyS_write+0x99/0x110
      do_syscall_64+0xee/0x2b0
      entry_SYSCALL_64_after_hwframe+0x42/0xb7
      
      The buggy address belongs to the page:
      page:ffffea0001998180 count:0 mapcount:0 mapping:0000000000000000 index:0x0
      flags: 0x4000000000000000()
      raw: 4000000000000000 0000000000000000 0000000000000000 00000000ffffffff
      raw: dead000000000100 dead000000000200 0000000000000000 0000000000000000
      page dumped because: kasan: bad access detected
      
      Memory state around the buggy address:
      ffff880066606d00: 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1
      ffff880066606d80: f1 00 f2 f2 f2 f2 f2 f2 f2 00 00 f2 f2 f2 f2 f2
      >ffff880066606e00: f2 00 00 00 00 00 f2 f2 f2 f3 f3 f3 f3 00 00 00
                                          ^
      ffff880066606e80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
      ffff880066606f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
      ==================================================================
      
      Fixes: 8700e3e7 ("Soft RoCE driver")
      Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
      Cc: Moni Shoua <monis@mellanox.com>
      Cc: stable@vger.kernel.org
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      a6544a62
    • C
      infiniband: remove redundant assignment to pointer 'rdi' · 042932f7
      Colin Ian King 提交于
      The pointer rdi is being initialized with a value that is never read
      and re-assigned immediately after, hence the initialization is redundant
      and can be removed.
      
      Cleans up clang warning:
      drivers/infiniband/sw/rdmavt/vt.c:94:23: warning: Value stored to 'rdi'
      during its initialization is never read
      Signed-off-by: Colin Ian King <colin.king@canonical.com>
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      042932f7