1. 23 Jul 2018 (3 commits)
  2. 04 Jul 2018 (4 commits)
  3. 23 Jun 2018 (1 commit)
  4. 13 Jun 2018 (1 commit)
    • treewide: kmalloc() -> kmalloc_array() · 6da2ec56
      Kees Cook authored
      The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
      patch replaces cases of:
      
              kmalloc(a * b, gfp)
      
      with:
              kmalloc_array(a, b, gfp)
      
      as well as handling cases of:
      
              kmalloc(a * b * c, gfp)
      
      with:
      
              kmalloc(array3_size(a, b, c), gfp)
      
      as it's slightly less ugly than:
      
              kmalloc_array(array_size(a, b), c, gfp)
      
      This does, however, attempt to ignore constant size factors like:
      
              kmalloc(4 * 1024, gfp)
      
      though any constants defined via macros get caught up in the conversion.
      
      Any factors with a sizeof() of "unsigned char", "char", and "u8" were
      dropped, since they're redundant.
      
      The tools/ directory was manually excluded, since it has its own
      implementation of kmalloc().
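
      As an illustration of the pattern being converted (a sketch; the
      identifiers below are made up, not taken from the patch):

              struct item *buf;

              /* before: the multiplication can overflow and under-allocate */
              buf = kmalloc(count * sizeof(*buf), GFP_KERNEL);

              /* after: kmalloc_array() checks the multiplication and
               * returns NULL on overflow instead */
              buf = kmalloc_array(count, sizeof(*buf), GFP_KERNEL);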
      
      The Coccinelle script used for this was:
      
      // Fix redundant parens around sizeof().
      @@
      type TYPE;
      expression THING, E;
      @@
      
      (
        kmalloc(
      -	(sizeof(TYPE)) * E
      +	sizeof(TYPE) * E
        , ...)
      |
        kmalloc(
      -	(sizeof(THING)) * E
      +	sizeof(THING) * E
        , ...)
      )
      
      // Drop single-byte sizes and redundant parens.
      @@
      expression COUNT;
      typedef u8;
      typedef __u8;
      @@
      
      (
        kmalloc(
      -	sizeof(u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(__u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(char) * (COUNT)
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(unsigned char) * (COUNT)
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(u8) * COUNT
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(__u8) * COUNT
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(char) * COUNT
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(unsigned char) * COUNT
      +	COUNT
        , ...)
      )
      
      // 2-factor product with sizeof(type/expression) and identifier or constant.
      @@
      type TYPE;
      expression THING;
      identifier COUNT_ID;
      constant COUNT_CONST;
      @@
      
      (
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * (COUNT_ID)
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * COUNT_ID
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * COUNT_CONST
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * (COUNT_ID)
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * COUNT_ID
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * COUNT_CONST
      +	COUNT_CONST, sizeof(THING)
        , ...)
      )
      
      // 2-factor product, only identifiers.
      @@
      identifier SIZE, COUNT;
      @@
      
      - kmalloc
      + kmalloc_array
        (
      -	SIZE * COUNT
      +	COUNT, SIZE
        , ...)
      
      // 3-factor product with 1 sizeof(type) or sizeof(expression), with
      // redundant parens removed.
      @@
      expression THING;
      identifier STRIDE, COUNT;
      type TYPE;
      @@
      
      (
        kmalloc(
      -	sizeof(TYPE) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kmalloc(
      -	sizeof(THING) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kmalloc(
      -	sizeof(THING) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kmalloc(
      -	sizeof(THING) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kmalloc(
      -	sizeof(THING) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      )
      
      // 3-factor product with 2 sizeof(variable), with redundant parens removed.
      @@
      expression THING1, THING2;
      identifier COUNT;
      type TYPE1, TYPE2;
      @@
      
      (
        kmalloc(
      -	sizeof(TYPE1) * sizeof(TYPE2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kmalloc(
      -	sizeof(THING1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kmalloc(
      -	sizeof(THING1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      )
      
      // 3-factor product, only identifiers, with redundant parens removed.
      @@
      identifier STRIDE, SIZE, COUNT;
      @@
      
      (
        kmalloc(
      -	(COUNT) * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	COUNT * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	COUNT * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	(COUNT) * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	COUNT * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	(COUNT) * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	(COUNT) * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	COUNT * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      )
      
      // Any remaining multi-factor products, first at least 3-factor products,
      // when they're not all constants...
      @@
      expression E1, E2, E3;
      constant C1, C2, C3;
      @@
      
      (
        kmalloc(C1 * C2 * C3, ...)
      |
        kmalloc(
      -	(E1) * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kmalloc(
      -	(E1) * (E2) * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kmalloc(
      -	(E1) * (E2) * (E3)
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kmalloc(
      -	E1 * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      )
      
      // And then all remaining 2-factor products when they're not all constants,
      // keeping sizeof() as the second factor argument.
      @@
      expression THING, E1, E2;
      type TYPE;
      constant C1, C2, C3;
      @@
      
      (
        kmalloc(sizeof(THING) * C2, ...)
      |
        kmalloc(sizeof(TYPE) * C2, ...)
      |
        kmalloc(C1 * C2 * C3, ...)
      |
        kmalloc(C1 * C2, ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * (E2)
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * E2
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * (E2)
      +	E2, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * E2
      +	E2, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	(E1) * E2
      +	E1, E2
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	(E1) * (E2)
      +	E1, E2
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	E1 * E2
      +	E1, E2
        , ...)
      )
      Signed-off-by: Kees Cook <keescook@chromium.org>
      6da2ec56
  5. 31 May 2018 (1 commit)
  6. 24 Apr 2018 (1 commit)
    • vhost_net: use packet weight for rx handler, too · db688c24
      Paolo Abeni authored
      Similar to commit a2ac9990 ("vhost-net: set packet weight of
      tx polling to 2 * vq size"), we need a packet-based limit for
      handle_rx, too - otherwise, under rx flood with small packets,
      tx can be delayed for a very long time, even without busypolling.

      The pkt limit applied to handle_rx must be the same as the one
      applied by handle_tx, or we will get unfair scheduling between
      rx and tx. Tying such a limit to the queue length makes it less
      effective for large queue length values and can introduce large
      process scheduler latencies, so a constant value is used, like
      the existing bytes limit.
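
      A minimal sketch of the kind of check this adds to the rx path (the
      constant name and loop shape here are assumptions based on the
      description above):

              /* inside the handle_rx() receive loop */
              if (unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT ||
                           total_len >= VHOST_NET_WEIGHT)) {
                      vhost_poll_queue(&vq->poll); /* defer, let tx run */
                      break;
              }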
      
      The selected limit has been validated with PVP[1] performance
      test with different queue sizes:
      
      queue size		256	512	1024
      
      baseline		366	354	362
      weight 128		715	723	670
      weight 256		740	745	733
      weight 512		600	460	583
      weight 1024		423	427	418
      
      A packet weight of 256 gives peak performance under all the
      tested scenarios.
      
      No measurable regression in unidirectional performance tests has
      been detected.
      
      [1] https://developers.redhat.com/blog/2017/06/05/measuring-and-comparing-open-vswitch-performance/
      Signed-off-by: Paolo Abeni <pabeni@redhat.com>
      Acked-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      db688c24
  7. 17 Apr 2018 (1 commit)
  8. 09 Apr 2018 (1 commit)
    • vhost-net: set packet weight of tx polling to 2 * vq size · a2ac9990
      haibinzhang(张海斌) authored
      handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy
      polling udp packets with small length (e.g. 1-byte udp payload), because the
      VHOST_NET_WEIGHT setting takes into account only sent bytes, not single packet length.
      
      Ping-Latencies shown below were tested between two Virtual Machines using
      netperf (UDP_STREAM, len=1), and then another machine pinged the client:
      
      vq size=256
      Packet-Weight   Ping-Latencies(millisecond)
                         min      avg       max
      Origin           3.319   18.489    57.303
      64               1.643    2.021     2.552
      128              1.825    2.600     3.224
      256              1.997    2.710     4.295
      512              1.860    3.171     4.631
      1024             2.002    4.173     9.056
      2048             2.257    5.650     9.688
      4096             2.093    8.508    15.943
      
      vq size=512
      Packet-Weight   Ping-Latencies(millisecond)
                         min      avg       max
      Origin           6.537   29.177    66.245
      64               2.798    3.614     4.403
      128              2.861    3.820     4.775
      256              3.008    4.018     4.807
      512              3.254    4.523     5.824
      1024             3.079    5.335     7.747
      2048             3.944    8.201    12.762
      4096             4.158   11.057    19.985
      
      The results seem pretty consistent, with a small dip at both VQ sizes.
      Ring size is a hint from the device about the burst size it can tolerate. Based on
      benchmarks, set the weight to 2 * vq size.
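
      In sketch form, the resulting tx check looks like this (the macro and
      counter names are assumptions for illustration):

              /* packet weight scales with the ring size, as chosen above */
              #define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)

              /* inside the handle_tx() send loop */
              if (total_len >= VHOST_NET_WEIGHT ||
                  ++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq)) {
                      vhost_poll_queue(&vq->poll); /* yield so rx can run */
                      break;
              }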
      
      To evaluate this change, another set of tests was done using netperf (RR, TX) between
      two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and the vq size was
      tweaked through qemu. The results shown below do not show obvious changes.
      
      vq size=256 TCP_RR                vq size=512 TCP_RR
      size/sessions/+thu%/+normalize%   size/sessions/+thu%/+normalize%
         1/       1/  -7%/        -2%      1/       1/   0%/        -2%
         1/       4/  +1%/         0%      1/       4/  +1%/         0%
         1/       8/  +1%/        -2%      1/       8/   0%/        +1%
        64/       1/  -6%/         0%     64/       1/  +7%/        +3%
        64/       4/   0%/        +2%     64/       4/  -1%/        +1%
        64/       8/   0%/         0%     64/       8/  -1%/        -2%
       256/       1/  -3%/        -4%    256/       1/  -4%/        -2%
       256/       4/  +3%/        +4%    256/       4/  +1%/        +2%
       256/       8/  +2%/         0%    256/       8/  +1%/        -1%
      
      vq size=256 UDP_RR                vq size=512 UDP_RR
      size/sessions/+thu%/+normalize%   size/sessions/+thu%/+normalize%
         1/       1/  -5%/        +1%      1/       1/  -3%/        -2%
         1/       4/  +4%/        +1%      1/       4/  -2%/        +2%
         1/       8/  -1%/        -1%      1/       8/  -1%/         0%
        64/       1/  -2%/        -3%     64/       1/  +1%/        +1%
        64/       4/  -5%/        -1%     64/       4/  +2%/         0%
        64/       8/   0%/        -1%     64/       8/  -2%/        +1%
       256/       1/  +7%/        +1%    256/       1/  -7%/         0%
       256/       4/  +1%/        +1%    256/       4/  -3%/        -4%
       256/       8/  +2%/        +2%    256/       8/  +1%/        +1%
      
      vq size=256 TCP_STREAM            vq size=512 TCP_STREAM
      size/sessions/+thu%/+normalize%   size/sessions/+thu%/+normalize%
        64/       1/   0%/        -3%     64/       1/   0%/         0%
        64/       4/  +3%/        -1%     64/       4/  -2%/        +4%
        64/       8/  +9%/        -4%     64/       8/  -1%/        +2%
       256/       1/  +1%/        -4%    256/       1/  +1%/        +1%
       256/       4/  -1%/        -1%    256/       4/  -3%/         0%
       256/       8/  +7%/        +5%    256/       8/  -3%/         0%
       512/       1/  +1%/         0%    512/       1/  -1%/        -1%
       512/       4/  +1%/        -1%    512/       4/   0%/         0%
       512/       8/  +7%/        -5%    512/       8/  +6%/        -1%
      1024/       1/   0%/        -1%   1024/       1/   0%/        +1%
      1024/       4/  +3%/         0%   1024/       4/  +1%/         0%
      1024/       8/  +8%/        +5%   1024/       8/  -1%/         0%
      2048/       1/  +2%/        +2%   2048/       1/  -1%/         0%
      2048/       4/  +1%/         0%   2048/       4/   0%/        -1%
      2048/       8/  -2%/         0%   2048/       8/   5%/        -1%
      4096/       1/  -2%/         0%   4096/       1/  -2%/         0%
      4096/       4/  +2%/         0%   4096/       4/   0%/         0%
      4096/       8/  +9%/        -2%   4096/       8/  -5%/        -1%
      Acked-by: Michael S. Tsirkin <mst@redhat.com>
      Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
      Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
      Signed-off-by: Lidong Chen <lidongchen@tencent.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      a2ac9990
  9. 27 Mar 2018 (1 commit)
  10. 10 Mar 2018 (3 commits)
    • vhost_net: examine pointer types during un-producing · 3a403076
      Jason Wang authored
      After commit fc72d1d5 ("tuntap: XDP transmission"), we can
      actually queue XDP pointers in the pointer ring, so we should
      examine the pointer type before freeing the pointer.
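
      A sketch of the type-aware free path this implies (the helper names
      follow the tuntap XDP series referenced below, but are reconstructed
      here and may differ from the final code):

              if (tun_is_xdp_buff(ptr)) {
                      struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);

                      put_page(virt_to_head_page(xdp->data)); /* XDP buffer */
              } else {
                      kfree_skb(ptr); /* ordinary sk_buff */
              }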
      
      Fixes: fc72d1d5 ("tuntap: XDP transmission")
      Reported-by: Michael S. Tsirkin <mst@redhat.com>
      Acked-by: Michael S. Tsirkin <mst@redhat.com>
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      3a403076
    • vhost_net: keep private_data and rx_ring synced · 303fd71b
      Jason Wang authored
      We get the pointer ring from the exported sock; this means we should keep
      rx_ring and vq->private_data synced during both vq stop and backend set,
      otherwise we may see a stale rx_ring.
      
      Fixes: c67df11f ("vhost_net: try batch dequing from skb array")
      Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Acked-by: Michael S. Tsirkin <mst@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      303fd71b
    • vhost_net: initialize rx_ring in vhost_net_open() · ab7e34b3
      Alexander Potapenko authored
      KMSAN reported a use of uninit memory in vhost_net_buf_unproduce()
      while trying to access n->vqs[VHOST_NET_VQ_TX].rx_ring:
      
      ==================================================================
      BUG: KMSAN: use of uninitialized memory in vhost_net_buf_unproduce+0x7bb/0x9a0 drivers/vhost/net.c:170
      CPU: 0 PID: 3021 Comm: syz-fuzzer Not tainted 4.16.0-rc4+ #3853
      Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
      Call Trace:
       __dump_stack lib/dump_stack.c:17 [inline]
       dump_stack+0x185/0x1d0 lib/dump_stack.c:53
       kmsan_report+0x142/0x1f0 mm/kmsan/kmsan.c:1093
       __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676
       vhost_net_buf_unproduce+0x7bb/0x9a0 drivers/vhost/net.c:170
       vhost_net_stop_vq drivers/vhost/net.c:974 [inline]
       vhost_net_stop+0x146/0x380 drivers/vhost/net.c:982
       vhost_net_release+0xb1/0x4f0 drivers/vhost/net.c:1015
       __fput+0x49f/0xa00 fs/file_table.c:209
       ____fput+0x37/0x40 fs/file_table.c:243
       task_work_run+0x243/0x2c0 kernel/task_work.c:113
       tracehook_notify_resume include/linux/tracehook.h:191 [inline]
       exit_to_usermode_loop arch/x86/entry/common.c:166 [inline]
       prepare_exit_to_usermode+0x349/0x3b0 arch/x86/entry/common.c:196
       syscall_return_slowpath+0xf3/0x6d0 arch/x86/entry/common.c:265
       do_syscall_64+0x34d/0x450 arch/x86/entry/common.c:292
      ...
      origin:
       kmsan_save_stack_with_flags mm/kmsan/kmsan.c:303 [inline]
       kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:213
       kmsan_kmalloc_large+0x6f/0xd0 mm/kmsan/kmsan.c:392
       kmalloc_large_node_hook mm/slub.c:1366 [inline]
       kmalloc_large_node mm/slub.c:3808 [inline]
       __kmalloc_node+0x100e/0x1290 mm/slub.c:3818
       kmalloc_node include/linux/slab.h:554 [inline]
       kvmalloc_node+0x1a5/0x2e0 mm/util.c:419
       kvmalloc include/linux/mm.h:541 [inline]
       vhost_net_open+0x64/0x5f0 drivers/vhost/net.c:921
       misc_open+0x7b5/0x8b0 drivers/char/misc.c:154
       chrdev_open+0xc28/0xd90 fs/char_dev.c:417
       do_dentry_open+0xccb/0x1430 fs/open.c:752
       vfs_open+0x272/0x2e0 fs/open.c:866
       do_last fs/namei.c:3378 [inline]
       path_openat+0x49ad/0x6580 fs/namei.c:3519
       do_filp_open+0x267/0x640 fs/namei.c:3553
       do_sys_open+0x6ad/0x9c0 fs/open.c:1059
       SYSC_openat+0xc7/0xe0 fs/open.c:1086
       SyS_openat+0x63/0x90 fs/open.c:1080
       do_syscall_64+0x2f1/0x450 arch/x86/entry/common.c:287
      ==================================================================
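
      The fix, in sketch form, clears the field when the device is opened so
      that the unproduce path never reads uninitialized memory (loop shown
      for illustration only):

              /* in vhost_net_open() */
              for (i = 0; i < VHOST_NET_VQ_MAX; i++)
                      n->vqs[i].rx_ring = NULL;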
      
      Fixes: c67df11f ("vhost_net: try batch dequing from skb array")
      Signed-off-by: Alexander Potapenko <glider@google.com>
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Acked-by: Michael S. Tsirkin <mst@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      ab7e34b3
  11. 13 Feb 2018 (1 commit)
    • net: make getname() functions return length rather than use int* parameter · 9b2c45d4
      Denys Vlasenko authored
      Changes since v1:
      Added changes in these files:
          drivers/infiniband/hw/usnic/usnic_transport.c
          drivers/staging/lustre/lnet/lnet/lib-socket.c
          drivers/target/iscsi/iscsi_target_login.c
          drivers/vhost/net.c
          fs/dlm/lowcomms.c
          fs/ocfs2/cluster/tcp.c
          security/tomoyo/network.c
      
      Before:
      All these functions either return a negative error indicator,
      or store length of sockaddr into "int *socklen" parameter
      and return zero on success.
      
      "int *socklen" parameter is awkward. For example, if caller does not
      care, it still needs to provide on-stack storage for the value
      it does not need.
      
      None of the many FOO_getname() functions of various protocols
      ever used the old value of *socklen. They always just overwrite it.

      This change drops this parameter and makes all these functions, on success,
      return the length of the sockaddr. It's always >= 0 and can be differentiated
      from an error.
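
      An illustrative call under the new convention (the surrounding code is
      hypothetical):

              struct sockaddr_storage addr;
              int len;

              len = kernel_getsockname(sock, (struct sockaddr *)&addr);
              if (len < 0)
                      return len; /* negative error code */
              /* on success, len is the length of the returned sockaddr */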
      
      Tests in callers are changed from "if (err)" to "if (err < 0)", where needed.
      
      rpc_sockname() lost "int buflen" parameter, since its only use was
      to be passed to kernel_getsockname() as &buflen and subsequently
      not used in any way.
      
      Userspace API is not changed.
      
          text    data     bss      dec     hex filename
      30108430 2633624  873672 33615726 200ef6e vmlinux.before.o
      30108109 2633612  873672 33615393 200ee21 vmlinux.o
      Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
      CC: David S. Miller <davem@davemloft.net>
      CC: linux-kernel@vger.kernel.org
      CC: netdev@vger.kernel.org
      CC: linux-bluetooth@vger.kernel.org
      CC: linux-decnet-user@lists.sourceforge.net
      CC: linux-wireless@vger.kernel.org
      CC: linux-rdma@vger.kernel.org
      CC: linux-sctp@vger.kernel.org
      CC: linux-nfs@vger.kernel.org
      CC: linux-x25@vger.kernel.org
      Signed-off-by: David S. Miller <davem@davemloft.net>
      9b2c45d4
  12. 12 Feb 2018 (1 commit)
    • vfs: do bulk POLL* -> EPOLL* replacement · a9a08845
      Linus Torvalds authored
      This is the mindless scripted replacement of kernel use of POLL*
      variables as described by Al, done by this script:
      
          for V in IN OUT PRI ERR RDNORM RDBAND WRNORM WRBAND HUP RDHUP NVAL MSG; do
              L=`git grep -l -w POLL$V | grep -v '^t' | grep -v /um/ | grep -v '^sa' | grep -v '/poll.h$'|grep -v '^D'`
              for f in $L; do sed -i "-es/^\([^\"]*\)\(\<POLL$V\>\)/\\1E\\2/" $f; done
          done
      
      with de-mangling cleanups yet to come.
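
      The net effect on a typical poll handler looks like this (an
      illustrative line, not a specific file):

              /* before */
              mask |= POLLIN | POLLRDNORM;
              /* after the scripted replacement */
              mask |= EPOLLIN | EPOLLRDNORM;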
      
      NOTE! On almost all architectures, the EPOLL* constants have the same
      values as the POLL* constants do.  But the keyword here is "almost".
      For various bad reasons they aren't the same, and epoll() doesn't
      actually work quite correctly in some cases due to this on Sparc et al.
      
      The next patch from Al will sort out the final differences, and we
      should be all done.
      Scripted-by: Al Viro <viro@zeniv.linux.org.uk>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      a9a08845
  13. 01 Feb 2018 (1 commit)
  14. 30 Jan 2018 (1 commit)
  15. 11 Jan 2018 (1 commit)
    • vhost_net: batch used ring update in rx · e2b3b35e
      Jason Wang authored
      This patch tries to batch used ring updates during RX. This is a good
      fit for the case when the guest is much faster (e.g. a dpdk based
      backend). In this case, the used ring is almost always empty:

      - we may get serious cache line misses/contention on both the used ring
        and the used idx.
      - at most 1 packet could be dequeued at a time, so batching in the guest
        does not have much effect.
      
      Updating the used ring in a batch can help since the guest won't access the
      used ring until the used idx has been advanced for several descriptors, and
      since we advance the used ring only every N packets, the guest will only
      need to access the used idx once every N packets, as it can cache the used
      idx in between. To have a better interaction between batch dequeuing and
      dpdk batching, VHOST_RX_BATCH was used as the maximum number of descriptors
      that could be batched.
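
      In sketch form, the rx loop accumulates completed heads and flushes
      them only once enough have piled up (field names are reconstructed
      from the description above):

              nvq->done_idx += headcount; /* descriptors finished this round */
              if (nvq->done_idx > VHOST_RX_BATCH) {
                      vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
                                                  nvq->done_idx);
                      nvq->done_idx = 0;
              }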
      
      Tests were done between two machines with 2.40GHz Intel(R) Xeon(R) CPU
      E5-2630 connected back to back through ixgbe. Traffic was generated
      on one remote ixgbe through MoonGen and the RX pps was measured through
      testpmd in the guest while doing xdp_redirect_map from the local ixgbe to
      tap. RX pps increased from 3.05 Mpps to 4.00 Mpps (about 31%
      improvement).
      
      One possible concern for this is the implications for TCP (especially
      latency sensitive workloads). Result[1] does not show obvious changes
      for most of the netperf tests (RR, TX, and RX), and we do get some
      improvements for RX at some specific sizes.
      
      Guest RX:
      
      size/sessions/+thu%/+normalize%
         64/     1/   +2%/   +2%
         64/     2/   +2%/   -1%
         64/     4/   +1%/   +1%
         64/     8/    0%/    0%
        256/     1/   +6%/   -3%
        256/     2/   -3%/   +2%
        256/     4/  +11%/  +11%
        256/     8/    0%/    0%
        512/     1/   +4%/    0%
        512/     2/   +2%/   +2%
        512/     4/    0%/   -1%
        512/     8/   -8%/   -8%
       1024/     1/   -7%/  -17%
       1024/     2/   -8%/   -7%
       1024/     4/   +1%/    0%
       1024/     8/    0%/    0%
       2048/     1/  +30%/  +14%
       2048/     2/  +46%/  +40%
       2048/     4/    0%/    0%
       2048/     8/    0%/    0%
       4096/     1/  +23%/  +22%
       4096/     2/  +26%/  +23%
       4096/     4/    0%/   +1%
       4096/     8/    0%/    0%
      16384/     1/   -2%/   -3%
      16384/     2/   +1%/   -4%
      16384/     4/   -1%/   -3%
      16384/     8/    0%/   -1%
      65535/     1/  +15%/   +7%
      65535/     2/   +4%/   +7%
      65535/     4/    0%/   +1%
      65535/     8/    0%/    0%
      
      TCP_RR:
      
      size/sessions/+thu%/+normalize%
          1/     1/    0%/   +1%
          1/    25/   +2%/   +1%
          1/    50/   +4%/   +1%
         64/     1/    0%/   -4%
         64/    25/   +2%/   +1%
         64/    50/    0%/   -1%
        256/     1/    0%/    0%
        256/    25/    0%/    0%
        256/    50/   +4%/   +2%
      
      Guest TX:
      
      size/sessions/+thu%/+normalize%
         64/     1/   +4%/   -2%
         64/     2/   -6%/   -5%
         64/     4/   +3%/   +6%
         64/     8/    0%/   +3%
        256/     1/  +15%/  +16%
        256/     2/  +11%/  +12%
        256/     4/   +1%/    0%
        256/     8/   +5%/   +5%
        512/     1/   -1%/   -6%
        512/     2/    0%/   -8%
        512/     4/   -2%/   +4%
        512/     8/   +6%/   +9%
       1024/     1/   +3%/   +1%
       1024/     2/   +3%/   +9%
       1024/     4/    0%/   +7%
       1024/     8/    0%/   +7%
       2048/     1/   +8%/   +2%
       2048/     2/   +3%/   -1%
       2048/     4/   -1%/  +11%
       2048/     8/   +3%/   +9%
       4096/     1/   +8%/   +8%
       4096/     2/    0%/   -7%
       4096/     4/   +4%/   +4%
       4096/     8/   +2%/   +5%
      16384/     1/   -3%/   +1%
      16384/     2/   -1%/  -12%
      16384/     4/   -1%/   +5%
      16384/     8/    0%/   +1%
      65535/     1/    0%/   -3%
      65535/     2/   +5%/  +16%
      65535/     4/   +1%/   +2%
      65535/     8/   +1%/   -1%
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Acked-by: Michael S. Tsirkin <mst@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      e2b3b35e
  16. 09 Jan 2018 (2 commits)
    • tuntap: XDP transmission · fc72d1d5
      Jason Wang authored
      This patch implements XDP transmission for TAP. Since we can't create
      new queues for TAP during XDP set, the existing ptr_ring was reused for
      queuing XDP buffers. To distinguish an xdp_buff from an sk_buff, TUN_XDP_FLAG
      (0x1UL) is encoded into the lowest bit of the xdp_buff pointer during
      ptr_ring_produce, and decoded during consuming. XDP metadata is
      stored in the headroom of the packet, which should work in most
      cases since drivers usually reserve enough headroom. Very minor changes
      were needed for vhost_net: it just needs to peek the length depending on
      the type of pointer.
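
      The tagging scheme reduces to bit operations on the pointer value (a
      sketch reconstructed from the text above; helper names may differ
      slightly from the final code):

              #define TUN_XDP_FLAG 0x1UL

              static void *tun_xdp_to_ptr(void *ptr)
              {
                      return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
              }

              static void *tun_ptr_to_xdp(void *ptr)
              {
                      return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
              }

              static bool tun_is_xdp_buff(void *ptr)
              {
                      return (unsigned long)ptr & TUN_XDP_FLAG;
              }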
      
      Tests were done on two Intel E5-2630 2.40GHz machines connected back
      to back through two 82599ES. Traffic was generated/received through
      MoonGen/testpmd (rxonly). It reports a ~20% improvement when
      xdp_redirect_map is doing redirection from ixgbe to TAP (from 2.50Mpps
      to 3.05Mpps).
      
      Cc: Jesper Dangaard Brouer <brouer@redhat.com>
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      fc72d1d5
    • tun/tap: use ptr_ring instead of skb_array · 5990a305
      Jason Wang authored
      This patch switches to using ptr_ring instead of skb_array. This will be
      used to enqueue different types of pointers by encoding the type into the
      lower bits.
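
      For reference, the ptr_ring calls involved look roughly like this (from
      linux/ptr_ring.h; the ring field name is an assumption):

              /* producer side: any tagged pointer, not just an sk_buff */
              err = ptr_ring_produce(&tfile->tx_ring, ptr); /* non-zero if full */

              /* consumer side: NULL when the ring is empty */
              ptr = ptr_ring_consume(&tfile->tx_ring);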
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      5990a305
  17. 03 Dec 2017 (1 commit)
  18. 29 Nov 2017 (1 commit)
  19. 15 Nov 2017 (1 commit)
    • vhost_net: conditionally enable tx polling · feb8892c
      Jason Wang authored
      We always poll tx for the socket; this is suboptimal since it
      slightly increases the waitqueue traversal time and, more importantly,
      vhost could not benefit from commit 9e641bdc ("net-tun:
      restructure tun_do_read for better sleep/wakeup efficiency"): even though
      we'd stopped rx polling during handle_rx(), the tx poll was still left
      in the waitqueue.
      
      Pktgen from a remote host to a VM over mlx4 on two 2.00GHz Xeon E5-2650
      shows an 11.7% improvement in rx PPS (from 1.28Mpps to 1.44Mpps).
      
      Cc: Wei Xu <wexu@redhat.com>
      Cc: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      feb8892c
  20. 10 Oct 2017 (1 commit)
    • vhost_net: do not stall on zerocopy depletion · 1e6f7453
      Willem de Bruijn authored
      Vhost-net has a hard limit on the number of zerocopy skbs in flight.
      When reached, transmission stalls. Stalls cause latency, as well as
      head-of-line blocking of other flows that do not use zerocopy.
      
      Instead of stalling, revert to copy-based transmission.
      
      Tested by sending two udp flows from guest to host, one with payload
      of VHOST_GOODCOPY_LEN, the other too small for zerocopy (1B). The
      large flow is redirected to a netem instance with 1MBps rate limit
      and deep 1000 entry queue.
      
        modprobe ifb
        ip link set dev ifb0 up
        tc qdisc add dev ifb0 root netem limit 1000 rate 1MBit
      
        tc qdisc add dev tap0 ingress
        tc filter add dev tap0 parent ffff: protocol ip \
            u32 match ip dport 8000 0xffff \
            action mirred egress redirect dev ifb0
      
      Before the delay, both flows process around 80K pps. With the delay,
      before this patch, both process around 400. After this patch, the
      large flow is still rate limited, while the small reverts to its
      original rate. See also discussion in the first link, below.
      
      Without rate limiting, {1, 10, 100}x TCP_STREAM tests continued to
      send at 100% zerocopy.
      
      The limit in vhost_exceeds_maxpend must be carefully chosen. With
      vq->num >> 1, the flows remain correlated. This value happens to
      correspond to VHOST_MAX_PENDING for vq->num == 256. Allow smaller
      fractions and ensure correctness also for much smaller values of
      vq->num, by testing the min() of both explicitly. See also the
      discussion in the second link below.
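
      A simplified sketch of the decision (the real condition includes
      additional checks):

              zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN &&
                           !vhost_exceeds_maxpend(net);
              /* when zcopy_used is false, the packet goes through the normal
               * copy path instead of waiting for a zerocopy slot to free up */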
      
      Changes
        v1 -> v2
          - replaced min with typed min_t
          - avoid unnecessary whitespace change
      
      Link: http://lkml.kernel.org/r/CAF=yD-+Wk9sc9dXMUq1+x_hh=3ThTXa6BnZkygP3tgVpjbp93g@mail.gmail.com
      Link: http://lkml.kernel.org/r/20170819064129.27272-1-den@klaipeden.com
      Signed-off-by: Willem de Bruijn <willemb@google.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      1e6f7453
  21. 06 Sep 2017 (1 commit)
    • vhost_net: correctly check tx avail during rx busy polling · 8b949bef
      Jason Wang authored
      In the past we checked tx avail through vhost_enable_notify(), which is
      wrong since it only checks whether or not the guest has filled more
      available buffers since the last avail idx synchronization, which was just
      done by vhost_vq_avail_empty() before. What we really want is to check for
      pending buffers in the avail ring. Fix this by calling
      vhost_vq_avail_empty() instead.
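
      Conceptually the check becomes (a sketch using the helpers named
      above):

              /* during rx busy polling: is there tx work actually pending? */
              if (!vhost_vq_avail_empty(&net->dev, tvq))
                      vhost_poll_queue(&tvq->poll); /* yes: schedule handle_tx */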
      
      This issue could be noticed by doing netperf TCP_RR benchmark as
      client from guest (but not host). With this fix, TCP_RR from guest to
      localhost restores from 1375.91 trans per sec to 55235.28 trans per
      sec on my laptop (Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz).
      
      Fixes: 03088137 ("vhost_net: basic polling support")
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      8b949bef
  22. 02 Sep 2017 (1 commit)
  23. 04 Aug 2017 (1 commit)
    • sock: enable MSG_ZEROCOPY · 1f8b977a
      Willem de Bruijn authored
      Prepare the datapath for refcounted ubuf_info. Clone ubuf_info with
      skb_zerocopy_clone() wherever needed due to skb split, merge, resize
      or clone.
      
      Split skb_orphan_frags into two variants. The split, merge, .. paths
      support reference counted zerocopy buffers, so do not do a deep copy.
      Add skb_orphan_frags_rx for paths that may loop packets to receive
      sockets. That is not allowed, as it may cause unbounded latency.
      Deep copy all zerocopy copy buffers, ref-counted or not, in this path.
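
      In sketch form, the two call sites differ as follows (gfp flags are
      illustrative):

              /* tx/forwarding paths: refcounted zerocopy frags may stay attached */
              err = skb_orphan_frags(skb, GFP_ATOMIC);

              /* paths that may loop a packet to a local receive socket:
               * always deep-copy the frags */
              err = skb_orphan_frags_rx(skb, GFP_ATOMIC);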
      
      The exact locations to modify were chosen by exhaustively searching
      through all code that might modify skb_frag references and/or the
      SKBTX_DEV_ZEROCOPY tx_flags bit.
      
      The changes err on the safe side, in two ways.
      
      (1) legacy ubuf_info paths virtio and tap are not modified. They keep
          a 1:1 ubuf_info to sk_buff relationship. Calls to skb_orphan_frags
          still call skb_copy_ubufs and thus copy frags in this case.
      
      (2) not all copies deep in the stack are addressed yet. skb_shift,
          skb_split and skb_try_coalesce can be refined to avoid copying.
          These are not in the hot path and this patch is hairy enough as
          is, so that is left for future refinement.
      Signed-off-by: Willem de Bruijn <willemb@google.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      1f8b977a
  24. 13 Jul 2017 (1 commit)
    • mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic · dcda9b04
      Michal Hocko authored
      __GFP_REPEAT was designed to allow retry-but-eventually-fail semantics in
      the page allocator.  This has been true, but only for allocation
      requests larger than PAGE_ALLOC_COSTLY_ORDER.  It has always been
      ignored for smaller sizes.  This is a bit unfortunate because there is
      no way to express the same semantic for those requests and they are
      considered too important to fail so they might end up looping in the
      page allocator for ever, similarly to GFP_NOFAIL requests.
      
      Now that the whole tree has been cleaned up and accidental or misled
      usage of __GFP_REPEAT flag has been removed for !costly requests we can
      give the original flag a better name and more importantly a more useful
      semantic.  Let's rename it to __GFP_RETRY_MAYFAIL which tells the user
      that the allocator would try really hard but there is no promise of a
      success.  This will work independent of the order and overrides the
      default allocator behavior.  Page allocator users have several levels of
      guarantee vs.  cost options (take GFP_KERNEL as an example)
      
       - GFP_KERNEL & ~__GFP_RECLAIM - optimistic allocation without _any_
         attempt to free memory at all. The most light weight mode which even
         doesn't kick the background reclaim. Should be used carefully because
         it might deplete the memory and the next user might hit the more
         aggressive reclaim
      
       - GFP_KERNEL & ~__GFP_DIRECT_RECLAIM (or GFP_NOWAIT)- optimistic
         allocation without any attempt to free memory from the current
         context but can wake kswapd to reclaim memory if the zone is below
         the low watermark. Can be used from either atomic contexts or when
         the request is a performance optimization and there is another
         fallback for a slow path.
      
       - (GFP_KERNEL|__GFP_HIGH) & ~__GFP_DIRECT_RECLAIM (aka GFP_ATOMIC) -
         non sleeping allocation with an expensive fallback so it can access
         some portion of memory reserves. Usually used from interrupt/bh
         context with an expensive slow path fallback.
      
       - GFP_KERNEL - both background and direct reclaim are allowed and the
         _default_ page allocator behavior is used. That means that !costly
         allocation requests are basically nofail but there is no guarantee of
         that behavior so failures have to be checked properly by callers
         (e.g. OOM killer victim is allowed to fail currently).
      
       - GFP_KERNEL | __GFP_NORETRY - overrides the default allocator behavior
         and all allocation requests fail early rather than cause disruptive
         reclaim (one round of reclaim in this implementation). The OOM killer
         is not invoked.
      
       - GFP_KERNEL | __GFP_RETRY_MAYFAIL - overrides the default allocator
         behavior and all allocation requests try really hard. The request
         will fail if the reclaim cannot make any progress. The OOM killer
         won't be triggered.
      
       - GFP_KERNEL | __GFP_NOFAIL - overrides the default allocator behavior
         and all allocation requests will loop endlessly until they succeed.
         This might be really dangerous especially for larger orders.
      
      Existing users of __GFP_REPEAT are changed to __GFP_RETRY_MAYFAIL
      because they already had their semantic.  No new users are added.
      __alloc_pages_slowpath is changed to bail out for __GFP_RETRY_MAYFAIL if
      there is no progress and we have already passed the OOM point.
      
      This means that all the reclaim opportunities have been exhausted except
      the most disruptive one (the OOM killer) and a user defined fallback
      behavior is more sensible than keep retrying in the page allocator.
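
      A typical caller under the new semantics looks like this (illustrative
      only):

              buf = kvmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
              if (!buf)
                      return -ENOMEM; /* the caller supplies its own fallback */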
      
      [akpm@linux-foundation.org: fix arch/sparc/kernel/mdesc.c]
      [mhocko@suse.com: semantic fix]
        Link: http://lkml.kernel.org/r/20170626123847.GM11534@dhcp22.suse.cz
      [mhocko@kernel.org: address other thing spotted by Vlastimil]
        Link: http://lkml.kernel.org/r/20170626124233.GN11534@dhcp22.suse.cz
      Link: http://lkml.kernel.org/r/20170623085345.11304-3-mhocko@kernel.org
      Signed-off-by: Michal Hocko <mhocko@suse.com>
      Acked-by: Vlastimil Babka <vbabka@suse.cz>
      Cc: Alex Belits <alex.belits@cavium.com>
      Cc: Chris Wilson <chris@chris-wilson.co.uk>
      Cc: Christoph Hellwig <hch@infradead.org>
      Cc: Darrick J. Wong <darrick.wong@oracle.com>
      Cc: David Daney <david.daney@cavium.com>
      Cc: Johannes Weiner <hannes@cmpxchg.org>
      Cc: Mel Gorman <mgorman@suse.de>
      Cc: NeilBrown <neilb@suse.com>
      Cc: Ralf Baechle <ralf@linux-mips.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      dcda9b04
  25. 18 May 2017 (1 commit)
    • vhost_net: try batch dequing from skb array · c67df11f
      Jason Wang authored
      We used to dequeue one skb at a time during recvmsg() from the skb_array; this
      could be inefficient because of the poor cache utilization and the spinlock
      touched for each packet. This patch tries to batch them by calling the
      batch dequeuing helpers explicitly on the exported skb array and passing
      the skbs back through msg_control for the underlying socket to finish the
      userspace copying. Batch dequeuing is also a requirement for further
      batching improvements on the receive path.
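
      The batched consume reduces to roughly this (buffer bookkeeping
      simplified; field names are assumptions):

              /* refill the local cache of skbs in one locked operation */
              rxq->head = 0;
              rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
                                                    VHOST_RX_BATCH);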
      
      Tests were done by pktgen on tap with XDP1 in guest. Host is Intel(R)
      Xeon(R) CPU E5-2650 0 @ 2.00GHz.
      
      rx batch | pps
      
      0   2.25Mpps
      1   2.33Mpps (+3.56%)
      4   2.33Mpps (+3.56%)
      16  2.35Mpps (+4.44%)
      64  2.42Mpps (+7.56%) <- Default rx batching
      128 2.40Mpps (+6.67%)
      256 2.38Mpps (+5.78%)
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      c67df11f
  26. 09 May 2017 (1 commit)
  27. 02 Mar 2017 (2 commits)
  28. 12 Feb 2017 (1 commit)
  29. 19 Jan 2017 (1 commit)
  30. 16 Nov 2016 (1 commit)
  31. 02 Aug 2016 (1 commit)
    • vhost: new device IOTLB API · 6b1e6cc7
      Jason Wang authored
      This patch tries to implement a device IOTLB for vhost. This could be
      used with a userspace (qemu) implementation of DMA remapping
      to emulate an IOMMU for the guest.
      
      The idea is simple, cache the translation in a software device IOTLB
      (which is implemented as an interval tree) in vhost and use vhost_net
      file descriptor for reporting IOTLB miss and IOTLB
      update/invalidation. When vhost meets an IOTLB miss, the fault
      address, size and access can be read from the file. After userspace
      finishes the translation, it writes the translated address to the
      vhost_net file to update the device IOTLB.
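
      The miss/update messages exchanged over the vhost_net fd carry roughly
      this information (a sketch of the uapi structure; the exact layout may
      differ):

              struct vhost_iotlb_msg {
                      __u64 iova;   /* guest IO virtual address */
                      __u64 size;
                      __u64 uaddr;  /* userspace address, for updates */
                      __u8  perm;   /* VHOST_ACCESS_RO/WO/RW */
                      __u8  type;   /* VHOST_IOTLB_MISS/UPDATE/INVALIDATE */
              };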
      
      When the device IOTLB is enabled by setting VIRTIO_F_IOMMU_PLATFORM, all vq
      addresses set by ioctl are treated as iova instead of virtual addresses, and
      accesses can only be done through the IOTLB instead of direct userspace
      memory access. Before each round of vq processing, all vq metadata is
      prefetched into the device IOTLB to make sure no translation fault happens
      during vq processing.
      
      In most cases, virtqueues are contiguous even in virtual address space.
      The IOTLB translation for virtqueue itself may make it a little
      slower. We might add a fast path cache on top of this patch.
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      [mst: use virtio feature bit: VHOST_F_DEVICE_IOTLB -> VIRTIO_F_IOMMU_PLATFORM ]
      [mst: fix build warnings ]
      Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
      [ weiyj.lk: missing unlock on error ]
      Signed-off-by: Wei Yongjun <weiyj.lk@gmail.com>
      6b1e6cc7