diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 8db8405c1e99e72a1e50206182bd17ad1219d47d..768085f5956645869e65aa5f81edb3f0b972a99b 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -232,7 +232,7 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) } } else { ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); - s = (ctrl->fence_size & 0x3f) << 4; + s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4; for (i = 64; i < s; i += 64) { wqe = buf + i; *wqe = cpu_to_be32(0xffffffff); @@ -264,7 +264,7 @@ static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); } ctrl->srcrb_flags = 0; - ctrl->fence_size = size / 16; + ctrl->qpn_vlan.fence_size = size / 16; /* * Make sure descriptor is fully written before setting ownership bit * (because HW can start executing as soon as we do). @@ -1992,7 +1992,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, ctrl = get_send_wqe(qp, i); ctrl->owner_opcode = cpu_to_be32(1 << 31); if (qp->sq_max_wqes_per_wr == 1) - ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); + ctrl->qpn_vlan.fence_size = + 1 << (qp->sq.wqe_shift - 4); stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); } @@ -3169,8 +3170,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wmb(); *lso_wqe = lso_hdr_sz; - ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? - MLX4_WQE_CTRL_FENCE : 0) | size; + ctrl->qpn_vlan.fence_size = (wr->send_flags & IB_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; /* * Make sure descriptor is fully written before diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 51a2e8252b820bb76f84671cf0b88edf8ba7ce93..f32e272c83ddcebe1e7e633d5a8846d6c38d2ed1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1722,6 +1722,12 @@ static int mlx4_en_set_channels(struct net_device *dev, !channel->tx_count || !channel->rx_count) return -EINVAL; + if (channel->tx_count * MLX4_EN_NUM_UP <= priv->xdp_ring_num) { + en_err(priv, "Minimum %d tx channels required with XDP on\n", + priv->xdp_ring_num / MLX4_EN_NUM_UP + 1); + return -EINVAL; + } + mutex_lock(&mdev->state_lock); if (priv->port_up) { port_up = 1; @@ -1740,7 +1746,8 @@ static int mlx4_en_set_channels(struct net_device *dev, goto out; } - netif_set_real_num_tx_queues(dev, priv->tx_ring_num); + netif_set_real_num_tx_queues(dev, priv->tx_ring_num - + priv->xdp_ring_num); netif_set_real_num_rx_queues(dev, priv->rx_ring_num); if (dev->num_tc) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 6083775dae16175b874574d988a82c6b9320eada..9abbba6c147585c211efb793f82cc5f5cf996310 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -31,6 +31,7 @@ * */ +#include #include #include #include @@ -1521,6 +1522,24 @@ static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask); } +static void mlx4_en_init_recycle_ring(struct mlx4_en_priv *priv, + int tx_ring_idx) +{ + struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[tx_ring_idx]; + int rr_index; + + rr_index = (priv->xdp_ring_num - priv->tx_ring_num) + tx_ring_idx; + if (rr_index >= 0) { + tx_ring->free_tx_desc = mlx4_en_recycle_tx_desc; + tx_ring->recycle_ring = priv->rx_ring[rr_index]; + en_dbg(DRV, priv, + "Set tx_ring[%d]->recycle_ring = rx_ring[%d]\n", + tx_ring_idx, rr_index); + } else { + tx_ring->recycle_ring = NULL; + } +} + int mlx4_en_start_port(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -1643,6 +1662,8 @@ int mlx4_en_start_port(struct net_device *dev) } tx_ring->tx_queue = netdev_get_tx_queue(dev, i); + mlx4_en_init_recycle_ring(priv, i); + /* Arm CQ for TX completions */ mlx4_en_arm_cq(priv, cq); @@ -2112,6 +2133,11 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) en_err(priv, "Bad MTU size:%d.\n", new_mtu); return -EPERM; } + if (priv->xdp_ring_num && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) { + en_err(priv, "MTU size:%d requires frags but XDP running\n", + new_mtu); + return -EOPNOTSUPP; + } dev->mtu = new_mtu; if (netif_running(dev)) { @@ -2520,6 +2546,103 @@ static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 m return err; } +static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct bpf_prog *old_prog; + int xdp_ring_num; + int port_up = 0; + int err; + int i; + + xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0; + + /* No need to reconfigure buffers when simply swapping the + * program for a new one. + */ + if (priv->xdp_ring_num == xdp_ring_num) { + if (prog) { + prog = bpf_prog_add(prog, priv->rx_ring_num - 1); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + for (i = 0; i < priv->rx_ring_num; i++) { + /* This xchg is paired with READ_ONCE in the fastpath */ + old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + } + return 0; + } + + if (priv->num_frags > 1) { + en_err(priv, "Cannot set XDP if MTU requires multiple frags\n"); + return -EOPNOTSUPP; + } + + if (priv->tx_ring_num < xdp_ring_num + MLX4_EN_NUM_UP) { + en_err(priv, + "Minimum %d tx channels required to run XDP\n", + (xdp_ring_num + MLX4_EN_NUM_UP) / MLX4_EN_NUM_UP); + return -EINVAL; + } + + if (prog) { + prog = bpf_prog_add(prog, priv->rx_ring_num - 1); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + priv->xdp_ring_num = xdp_ring_num; + netif_set_real_num_tx_queues(dev, priv->tx_ring_num - + priv->xdp_ring_num); + + for (i = 0; i < priv->rx_ring_num; i++) { + old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + } + + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) { + en_err(priv, "Failed starting port %d for XDP change\n", + priv->port); + queue_work(mdev->workqueue, &priv->watchdog_task); + } + } + + mutex_unlock(&mdev->state_lock); + return 0; +} + +static bool mlx4_xdp_attached(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + return !!priv->xdp_ring_num; +} + +static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return mlx4_xdp_set(dev, xdp->prog); + case XDP_QUERY_PROG: + xdp->prog_attached = mlx4_xdp_attached(dev); + return 0; + default: + return -EINVAL; + } +} + static const struct net_device_ops mlx4_netdev_ops = { .ndo_open = mlx4_en_open, .ndo_stop = mlx4_en_close, @@ -2548,6 +2671,7 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, + .ndo_xdp = mlx4_xdp, }; static const struct net_device_ops mlx4_netdev_ops_master = { @@ -2584,6 +2708,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, + .ndo_xdp = mlx4_xdp, }; struct mlx4_en_bond { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index c1b3a9c8cf3b4db9412e722e09983e72c64e3caa..11d88c817137f6753721570f8c4682c619d94091 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -32,6 +32,7 @@ */ #include +#include #include #include #include @@ -57,7 +58,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv, struct page *page; dma_addr_t dma; - for (order = MLX4_EN_ALLOC_PREFER_ORDER; ;) { + for (order = frag_info->order; ;) { gfp_t gfp = _gfp; if (order) @@ -70,7 +71,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv, return -ENOMEM; } dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order, - PCI_DMA_FROMDEVICE); + frag_info->dma_dir); if (dma_mapping_error(priv->ddev, dma)) { put_page(page); return -ENOMEM; @@ -124,7 +125,8 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, while (i--) { if (page_alloc[i].page != ring_alloc[i].page) { dma_unmap_page(priv->ddev, page_alloc[i].dma, - page_alloc[i].page_size, PCI_DMA_FROMDEVICE); + page_alloc[i].page_size, + priv->frag_info[i].dma_dir); page = page_alloc[i].page; /* Revert changes done by mlx4_alloc_pages */ page_ref_sub(page, page_alloc[i].page_size / @@ -145,7 +147,7 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv, if (next_frag_end > frags[i].page_size) dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size, - PCI_DMA_FROMDEVICE); + frag_info->dma_dir); if (frags[i].page) put_page(frags[i].page); @@ -176,7 +178,8 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv, page_alloc = &ring->page_alloc[i]; dma_unmap_page(priv->ddev, page_alloc->dma, - page_alloc->page_size, PCI_DMA_FROMDEVICE); + page_alloc->page_size, + priv->frag_info[i].dma_dir); page = page_alloc->page; /* Revert changes done by mlx4_alloc_pages */ page_ref_sub(page, page_alloc->page_size / @@ -201,7 +204,7 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv, i, page_count(page_alloc->page)); dma_unmap_page(priv->ddev, page_alloc->dma, - page_alloc->page_size, PCI_DMA_FROMDEVICE); + page_alloc->page_size, frag_info->dma_dir); while (page_alloc->page_offset + frag_info->frag_stride < page_alloc->page_size) { put_page(page_alloc->page); @@ -244,6 +247,12 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_alloc *frags = ring->rx_info + (index << priv->log_rx_info); + if (ring->page_cache.index > 0) { + frags[0] = ring->page_cache.buf[--ring->page_cache.index]; + rx_desc->data[0].addr = cpu_to_be64(frags[0].dma); + return 0; + } + return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp); } @@ -502,6 +511,24 @@ void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv) } } +/* When the rx ring is running in page-per-packet mode, a released frame can go + * directly into a small cache, to avoid unmapping or touching the page + * allocator. In bpf prog performance scenarios, buffers are either forwarded + * or dropped, never converted to skbs, so every page can come directly from + * this cache when it is sized to be a multiple of the napi budget. + */ +bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring, + struct mlx4_en_rx_alloc *frame) +{ + struct mlx4_en_page_cache *cache = &ring->page_cache; + + if (cache->index >= MLX4_EN_CACHE_SIZE) + return false; + + cache->buf[cache->index++] = *frame; + return true; +} + void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring **pring, u32 size, u16 stride) @@ -509,6 +536,8 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rx_ring *ring = *pring; + if (ring->xdp_prog) + bpf_prog_put(ring->xdp_prog); mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE); vfree(ring->rx_info); ring->rx_info = NULL; @@ -522,6 +551,16 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring) { + int i; + + for (i = 0; i < ring->page_cache.index; i++) { + struct mlx4_en_rx_alloc *frame = &ring->page_cache.buf[i]; + + dma_unmap_page(priv->ddev, frame->dma, frame->page_size, + priv->frag_info[0].dma_dir); + put_page(frame->page); + } + ring->page_cache.index = 0; mlx4_en_free_rx_buf(priv, ring); if (ring->stride <= TXBB_SIZE) ring->buf -= TXBB_SIZE; @@ -743,7 +782,10 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring]; struct mlx4_en_rx_alloc *frags; struct mlx4_en_rx_desc *rx_desc; + struct bpf_prog *xdp_prog; + int doorbell_pending; struct sk_buff *skb; + int tx_index; int index; int nr; unsigned int length; @@ -759,6 +801,10 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud if (budget <= 0) return polled; + xdp_prog = READ_ONCE(ring->xdp_prog); + doorbell_pending = 0; + tx_index = (priv->tx_ring_num - priv->xdp_ring_num) + cq->ring; + /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx * descriptor offset can be deduced from the CQE index instead of * reading 'cqe->index' */ @@ -835,6 +881,43 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) && (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL)); + /* A bpf program gets first chance to drop the packet. It may + * read bytes but not past the end of the frag. + */ + if (xdp_prog) { + struct xdp_buff xdp; + dma_addr_t dma; + u32 act; + + dma = be64_to_cpu(rx_desc->data[0].addr); + dma_sync_single_for_cpu(priv->ddev, dma, + priv->frag_info[0].frag_size, + DMA_FROM_DEVICE); + + xdp.data = page_address(frags[0].page) + + frags[0].page_offset; + xdp.data_end = xdp.data + length; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + if (!mlx4_en_xmit_frame(frags, dev, + length, tx_index, + &doorbell_pending)) + goto consumed; + break; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + case XDP_DROP: + if (mlx4_en_rx_recycle(ring, frags)) + goto consumed; + goto next; + } + } + if (likely(dev->features & NETIF_F_RXCSUM)) { if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP | MLX4_CQE_STATUS_UDP)) { @@ -986,6 +1069,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud for (nr = 0; nr < priv->num_frags; nr++) mlx4_en_free_frag(priv, frags, nr); +consumed: ++cq->mcq.cons_index; index = (cq->mcq.cons_index) & ring->size_mask; cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor; @@ -994,6 +1078,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud } out: + if (doorbell_pending) + mlx4_en_xmit_doorbell(priv->tx_ring[tx_index]); + AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); mlx4_cq_set_ci(&cq->mcq); wmb(); /* ensure HW sees CQ consumer before we post new buffers */ @@ -1061,22 +1148,35 @@ static const int frag_sizes[] = { void mlx4_en_calc_rx_buf(struct net_device *dev) { + enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE; struct mlx4_en_priv *priv = netdev_priv(dev); - /* VLAN_HLEN is added twice,to support skb vlan tagged with multiple - * headers. (For example: ETH_P_8021Q and ETH_P_8021AD). - */ - int eff_mtu = dev->mtu + ETH_HLEN + (2 * VLAN_HLEN); + int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu); + int order = MLX4_EN_ALLOC_PREFER_ORDER; + u32 align = SMP_CACHE_BYTES; int buf_size = 0; int i = 0; + /* bpf requires buffers to be set up as 1 packet per page. + * This only works when num_frags == 1. + */ + if (priv->xdp_ring_num) { + dma_dir = PCI_DMA_BIDIRECTIONAL; + /* This will gain efficient xdp frame recycling at the expense + * of more costly truesize accounting + */ + align = PAGE_SIZE; + order = 0; + } + while (buf_size < eff_mtu) { + priv->frag_info[i].order = order; priv->frag_info[i].frag_size = (eff_mtu > buf_size + frag_sizes[i]) ? frag_sizes[i] : eff_mtu - buf_size; priv->frag_info[i].frag_prefix_size = buf_size; priv->frag_info[i].frag_stride = - ALIGN(priv->frag_info[i].frag_size, - SMP_CACHE_BYTES); + ALIGN(priv->frag_info[i].frag_size, align); + priv->frag_info[i].dma_dir = dma_dir; buf_size += priv->frag_info[i].frag_size; i++; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 76aa4d27183c7d9527141ec168fd347a5d1b0d86..9df87ca0515a2b9d0b523c176c88e44e2ab3fe60 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -196,6 +196,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, ring->last_nr_txbb = 1; memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info)); memset(ring->buf, 0, ring->buf_size); + ring->free_tx_desc = mlx4_en_free_tx_desc; ring->qp_state = MLX4_QP_STATE_RST; ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8); @@ -265,10 +266,10 @@ static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv, } -static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, - struct mlx4_en_tx_ring *ring, - int index, u8 owner, u64 timestamp, - int napi_mode) +u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u8 owner, u64 timestamp, + int napi_mode) { struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE; @@ -344,6 +345,27 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, return tx_info->nr_txbb; } +u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u8 owner, u64 timestamp, + int napi_mode) +{ + struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; + struct mlx4_en_rx_alloc frame = { + .page = tx_info->page, + .dma = tx_info->map0_dma, + .page_offset = 0, + .page_size = PAGE_SIZE, + }; + + if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) { + dma_unmap_page(priv->ddev, tx_info->map0_dma, + PAGE_SIZE, priv->frag_info[0].dma_dir); + put_page(tx_info->page); + } + + return tx_info->nr_txbb; +} int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) { @@ -362,7 +384,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) } while (ring->cons != ring->prod) { - ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring, + ring->last_nr_txbb = ring->free_tx_desc(priv, ring, ring->cons & ring->size_mask, !!(ring->cons & ring->size), 0, 0 /* Non-NAPI caller */); @@ -444,7 +466,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev, timestamp = mlx4_en_get_cqe_ts(cqe); /* free next descriptor */ - last_nr_txbb = mlx4_en_free_tx_desc( + last_nr_txbb = ring->free_tx_desc( priv, ring, ring_index, !!((ring_cons + txbbs_skipped) & ring->size), timestamp, napi_budget); @@ -476,6 +498,9 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev, ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb; ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped; + if (ring->free_tx_desc == mlx4_en_recycle_tx_desc) + return done < budget; + netdev_tx_completed_queue(ring->tx_queue, packets, bytes); /* Wakeup Tx queue if this stopped, and ring is not full. @@ -631,8 +656,7 @@ static int get_real_size(const struct sk_buff *skb, static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, const struct sk_buff *skb, const struct skb_shared_info *shinfo, - int real_size, u16 *vlan_tag, - int tx_ind, void *fragptr) + void *fragptr) { struct mlx4_wqe_inline_seg *inl = &tx_desc->inl; int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl; @@ -700,10 +724,66 @@ static void mlx4_bf_copy(void __iomem *dst, const void *src, __iowrite64_copy(dst, src, bytecnt / 8); } +void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring) +{ + wmb(); + /* Since there is no iowrite*_native() that writes the + * value as is, without byteswapping - using the one + * the doesn't do byteswapping in the relevant arch + * endianness. + */ +#if defined(__LITTLE_ENDIAN) + iowrite32( +#else + iowrite32be( +#endif + ring->doorbell_qpn, + ring->bf.uar->map + MLX4_SEND_DOORBELL); +} + +static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring, + struct mlx4_en_tx_desc *tx_desc, + union mlx4_wqe_qpn_vlan qpn_vlan, + int desc_size, int bf_index, + __be32 op_own, bool bf_ok, + bool send_doorbell) +{ + tx_desc->ctrl.qpn_vlan = qpn_vlan; + + if (bf_ok) { + op_own |= htonl((bf_index & 0xffff) << 8); + /* Ensure new descriptor hits memory + * before setting ownership of this descriptor to HW + */ + dma_wmb(); + tx_desc->ctrl.owner_opcode = op_own; + + wmb(); + + mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl, + desc_size); + + wmb(); + + ring->bf.offset ^= ring->bf.buf_size; + } else { + /* Ensure new descriptor hits memory + * before setting ownership of this descriptor to HW + */ + dma_wmb(); + tx_desc->ctrl.owner_opcode = op_own; + if (send_doorbell) + mlx4_en_xmit_doorbell(ring); + else + ring->xmit_more++; + } +} + netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct mlx4_en_priv *priv = netdev_priv(dev); + union mlx4_wqe_qpn_vlan qpn_vlan = {}; struct device *ddev = priv->ddev; struct mlx4_en_tx_ring *ring; struct mlx4_en_tx_desc *tx_desc; @@ -715,7 +795,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) int real_size; u32 index, bf_index; __be32 op_own; - u16 vlan_tag = 0; u16 vlan_proto = 0; int i_frag; int lso_header_size; @@ -725,6 +804,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) bool stop_queue; bool inline_ok; u32 ring_cons; + bool bf_ok; tx_ind = skb_get_queue_mapping(skb); ring = priv->tx_ring[tx_ind]; @@ -749,9 +829,17 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_drop; } + bf_ok = ring->bf_enabled; if (skb_vlan_tag_present(skb)) { - vlan_tag = skb_vlan_tag_get(skb); + qpn_vlan.vlan_tag = cpu_to_be16(skb_vlan_tag_get(skb)); vlan_proto = be16_to_cpu(skb->vlan_proto); + if (vlan_proto == ETH_P_8021AD) + qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN; + else if (vlan_proto == ETH_P_8021Q) + qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN; + else + qpn_vlan.ins_vlan = 0; + bf_ok = false; } netdev_txq_bql_enqueue_prefetchw(ring->tx_queue); @@ -771,6 +859,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) else { tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf; bounce = true; + bf_ok = false; } /* Save skb in tx_info ring */ @@ -907,8 +996,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len); if (tx_info->inl) - build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag, - tx_ind, fragptr); + build_inline_wqe(tx_desc, skb, shinfo, fragptr); if (skb->encapsulation) { union { @@ -946,60 +1034,15 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) real_size = (real_size / 16) & 0x3f; - if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && - !skb_vlan_tag_present(skb) && send_doorbell) { - tx_desc->ctrl.bf_qpn = ring->doorbell_qpn | - cpu_to_be32(real_size); - - op_own |= htonl((bf_index & 0xffff) << 8); - /* Ensure new descriptor hits memory - * before setting ownership of this descriptor to HW - */ - dma_wmb(); - tx_desc->ctrl.owner_opcode = op_own; - - wmb(); + bf_ok &= desc_size <= MAX_BF && send_doorbell; - mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl, - desc_size); - - wmb(); - - ring->bf.offset ^= ring->bf.buf_size; - } else { - tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); - if (vlan_proto == ETH_P_8021AD) - tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN; - else if (vlan_proto == ETH_P_8021Q) - tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN; - else - tx_desc->ctrl.ins_vlan = 0; - - tx_desc->ctrl.fence_size = real_size; + if (bf_ok) + qpn_vlan.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size); + else + qpn_vlan.fence_size = real_size; - /* Ensure new descriptor hits memory - * before setting ownership of this descriptor to HW - */ - dma_wmb(); - tx_desc->ctrl.owner_opcode = op_own; - if (send_doorbell) { - wmb(); - /* Since there is no iowrite*_native() that writes the - * value as is, without byteswapping - using the one - * the doesn't do byteswapping in the relevant arch - * endianness. - */ -#if defined(__LITTLE_ENDIAN) - iowrite32( -#else - iowrite32be( -#endif - ring->doorbell_qpn, - ring->bf.uar->map + MLX4_SEND_DOORBELL); - } else { - ring->xmit_more++; - } - } + mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, desc_size, bf_index, + op_own, bf_ok, send_doorbell); if (unlikely(stop_queue)) { /* If queue was emptied after the if (stop_queue) , and before @@ -1034,3 +1077,106 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame, + struct net_device *dev, unsigned int length, + int tx_ind, int *doorbell_pending) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + union mlx4_wqe_qpn_vlan qpn_vlan = {}; + struct mlx4_en_tx_ring *ring; + struct mlx4_en_tx_desc *tx_desc; + struct mlx4_wqe_data_seg *data; + struct mlx4_en_tx_info *tx_info; + int index, bf_index; + bool send_doorbell; + int nr_txbb = 1; + bool stop_queue; + dma_addr_t dma; + int real_size; + __be32 op_own; + u32 ring_cons; + bool bf_ok; + + BUILD_BUG_ON_MSG(ALIGN(CTRL_SIZE + DS_SIZE, TXBB_SIZE) != TXBB_SIZE, + "mlx4_en_xmit_frame requires minimum size tx desc"); + + ring = priv->tx_ring[tx_ind]; + + if (!priv->port_up) + goto tx_drop; + + if (mlx4_en_is_tx_ring_full(ring)) + goto tx_drop; + + /* fetch ring->cons far ahead before needing it to avoid stall */ + ring_cons = READ_ONCE(ring->cons); + + index = ring->prod & ring->size_mask; + tx_info = &ring->tx_info[index]; + + bf_ok = ring->bf_enabled; + + /* Track current inflight packets for performance analysis */ + AVG_PERF_COUNTER(priv->pstats.inflight_avg, + (u32)(ring->prod - ring_cons - 1)); + + bf_index = ring->prod; + tx_desc = ring->buf + index * TXBB_SIZE; + data = &tx_desc->data; + + dma = frame->dma; + + tx_info->page = frame->page; + frame->page = NULL; + tx_info->map0_dma = dma; + tx_info->map0_byte_count = length; + tx_info->nr_txbb = nr_txbb; + tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN); + tx_info->data_offset = (void *)data - (void *)tx_desc; + tx_info->ts_requested = 0; + tx_info->nr_maps = 1; + tx_info->linear = 1; + tx_info->inl = 0; + + dma_sync_single_for_device(priv->ddev, dma, length, PCI_DMA_TODEVICE); + + data->addr = cpu_to_be64(dma); + data->lkey = ring->mr_key; + dma_wmb(); + data->byte_count = cpu_to_be32(length); + + /* tx completion can avoid cache line miss for common cases */ + tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; + + op_own = cpu_to_be32(MLX4_OPCODE_SEND) | + ((ring->prod & ring->size) ? + cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); + + ring->packets++; + ring->bytes += tx_info->nr_bytes; + AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, length); + + ring->prod += nr_txbb; + + stop_queue = mlx4_en_is_tx_ring_full(ring); + send_doorbell = stop_queue || + *doorbell_pending > MLX4_EN_DOORBELL_BUDGET; + bf_ok &= send_doorbell; + + real_size = ((CTRL_SIZE + nr_txbb * DS_SIZE) / 16) & 0x3f; + + if (bf_ok) + qpn_vlan.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size); + else + qpn_vlan.fence_size = real_size; + + mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, TXBB_SIZE, bf_index, + op_own, bf_ok, send_doorbell); + *doorbell_pending = send_doorbell ? 0 : *doorbell_pending + 1; + + return NETDEV_TX_OK; + +tx_drop: + ring->tx_dropped++; + return NETDEV_TX_BUSY; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index d39bf594abe4ae37709ce8e01c1a11eb360d9f0d..29c81d26f9f54c4abde53657faeb126bc5989aa0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -132,6 +132,7 @@ enum { MLX4_EN_NUM_UP) #define MLX4_EN_DEFAULT_TX_WORK 256 +#define MLX4_EN_DOORBELL_BUDGET 8 /* Target number of packets to coalesce with interrupt moderation */ #define MLX4_EN_RX_COAL_TARGET 44 @@ -164,6 +165,10 @@ enum { #define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN) #define MLX4_EN_MIN_MTU 46 +/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple + * headers. (For example: ETH_P_8021Q and ETH_P_8021AD). + */ +#define MLX4_EN_EFF_MTU(mtu) ((mtu) + ETH_HLEN + (2 * VLAN_HLEN)) #define ETH_BCAST 0xffffffffffffULL #define MLX4_EN_LOOPBACK_RETRIES 5 @@ -215,7 +220,10 @@ enum cq_type { struct mlx4_en_tx_info { - struct sk_buff *skb; + union { + struct sk_buff *skb; + struct page *page; + }; dma_addr_t map0_dma; u32 map0_byte_count; u32 nr_txbb; @@ -255,6 +263,14 @@ struct mlx4_en_rx_alloc { u32 page_size; }; +#define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT) +struct mlx4_en_page_cache { + u32 index; + struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE]; +}; + +struct mlx4_en_priv; + struct mlx4_en_tx_ring { /* cache line used and dirtied in tx completion * (mlx4_en_free_tx_buf()) @@ -288,6 +304,11 @@ struct mlx4_en_tx_ring { __be32 mr_key; void *buf; struct mlx4_en_tx_info *tx_info; + struct mlx4_en_rx_ring *recycle_ring; + u32 (*free_tx_desc)(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u8 owner, + u64 timestamp, int napi_mode); u8 *bounce_buf; struct mlx4_qp_context context; int qpn; @@ -319,6 +340,8 @@ struct mlx4_en_rx_ring { u8 fcs_del; void *buf; void *rx_info; + struct bpf_prog *xdp_prog; + struct mlx4_en_page_cache page_cache; unsigned long bytes; unsigned long packets; unsigned long csum_ok; @@ -438,7 +461,9 @@ struct mlx4_en_mc_list { struct mlx4_en_frag_info { u16 frag_size; u16 frag_prefix_size; - u16 frag_stride; + u32 frag_stride; + enum dma_data_direction dma_dir; + int order; }; #ifdef CONFIG_MLX4_EN_DCB @@ -558,6 +583,7 @@ struct mlx4_en_priv { struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS]; u16 num_frags; u16 log_rx_info; + int xdp_ring_num; struct mlx4_en_tx_ring **tx_ring; struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS]; @@ -663,6 +689,12 @@ void mlx4_en_tx_irq(struct mlx4_cq *mcq); u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, void *accel_priv, select_queue_fallback_t fallback); netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); +netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame, + struct net_device *dev, unsigned int length, + int tx_ind, int *doorbell_pending); +void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring); +bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring, + struct mlx4_en_rx_alloc *frame); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring, @@ -691,6 +723,14 @@ int mlx4_en_process_rx_cq(struct net_device *dev, int budget); int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget); int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget); +u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u8 owner, u64 timestamp, + int napi_mode); +u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u8 owner, u64 timestamp, + int napi_mode); void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, int is_tx, int rss, int qpn, int cqn, int user_prio, struct mlx4_qp_context *context); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c13e92b00bf580d4145e0eb853f0a3cb422e7fa1..75a5ae6bee074f9c665e141ab2a6dcd76c5f7d3f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -224,6 +224,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i); struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index 6fc31ef1da2d8e9efe5d91b8e4349ed4fe0dd809..15d816a8b7551883f21dedbe65905a12f17baa02 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -368,6 +368,11 @@ struct bpf_skb_data_end { void *data_end; }; +struct xdp_buff { + void *data; + void *data_end; +}; + /* compute the linear packet data range [data, data_end) which * will be accessed by cls_bpf and act_bpf programs */ @@ -429,6 +434,18 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, return BPF_PROG_RUN(prog, skb); } +static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, + struct xdp_buff *xdp) +{ + u32 ret; + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, (void *)xdp); + rcu_read_unlock(); + + return ret; +} + static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), @@ -509,6 +526,7 @@ bool bpf_helper_changes_skb_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +void bpf_warn_invalid_xdp_action(u32 act); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 587cdf943b524abd88fa945844b1e5b209b7fe2e..deaa2217214d2de36da51410e252d382be4ff04b 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -291,16 +291,18 @@ enum { MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0, }; +union mlx4_wqe_qpn_vlan { + struct { + __be16 vlan_tag; + u8 ins_vlan; + u8 fence_size; + }; + __be32 bf_qpn; +}; + struct mlx4_wqe_ctrl_seg { __be32 owner_opcode; - union { - struct { - __be16 vlan_tag; - u8 ins_vlan; - u8 fence_size; - }; - __be32 bf_qpn; - }; + union mlx4_wqe_qpn_vlan qpn_vlan; /* * High 24 bits are SRC remote buffer; low 8 bits are flags: * [7] SO (strong ordering) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 49736a31acaa38846f175a305545a4083c540af5..fab9a1c2a2ac06324eb6686123bcc12774b0d302 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -63,6 +63,7 @@ struct wpan_dev; struct mpls_dev; /* UDP Tunnel offloads */ struct udp_tunnel_info; +struct bpf_prog; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); @@ -799,6 +800,33 @@ struct tc_to_netdev { }; }; +/* These structures hold the attributes of xdp state that are being passed + * to the netdevice through the xdp op. + */ +enum xdp_netdev_command { + /* Set or clear a bpf program used in the earliest stages of packet + * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee + * is responsible for calling bpf_prog_put on any old progs that are + * stored. In case of error, the callee need not release the new prog + * reference, but on success it takes ownership and must bpf_prog_put + * when it is no longer used. + */ + XDP_SETUP_PROG, + /* Check if a bpf program is set on the device. The callee should + * return true if a program is currently attached and running. + */ + XDP_QUERY_PROG, +}; + +struct netdev_xdp { + enum xdp_netdev_command command; + union { + /* XDP_SETUP_PROG */ + struct bpf_prog *prog; + /* XDP_QUERY_PROG */ + bool prog_attached; + }; +}; /* * This structure defines the management hooks for network devices. @@ -1087,6 +1115,9 @@ struct tc_to_netdev { * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. + * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); + * This function is used to set or query state related to XDP on the + * netdevice. See definition of enum xdp_netdev_command for details. * */ struct net_device_ops { @@ -1271,6 +1302,8 @@ struct net_device_ops { struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); + int (*ndo_xdp)(struct net_device *dev, + struct netdev_xdp *xdp); }; /** @@ -3257,6 +3290,7 @@ int dev_get_phys_port_id(struct net_device *dev, int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); +int dev_change_xdp_fd(struct net_device *dev, int fd); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c4d922439d20433cdeeb4828f2b87847e5f573f2..2b7076f5b5ad3465d7f30ba898771621acbd13a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -94,6 +94,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, }; #define BPF_PSEUDO_MAP_FD 1 @@ -439,4 +440,24 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will result + * in packet drop. + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, + XDP_TX, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4285ac31e865d6c3a97a8aee402c73478b8f8b43..a1b5202c5f6b69e81f8321af46cc3e536bf5770d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -156,6 +156,7 @@ enum { IFLA_GSO_MAX_SEGS, IFLA_GSO_MAX_SIZE, IFLA_PAD, + IFLA_XDP, __IFLA_MAX }; @@ -843,4 +844,15 @@ enum { }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) +/* XDP section */ + +enum { + IFLA_XDP_UNSPEC, + IFLA_XDP_FD, + IFLA_XDP_ATTACHED, + __IFLA_XDP_MAX, +}; + +#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 96d938a2205016f17223e76851b6e9b4b1a881f3..228f962447a508f50ec2a2f81f83bdb1f9368e77 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -670,14 +670,20 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&prog->aux->refcnt); + if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_sub(i, &prog->aux->refcnt); return ERR_PTR(-EBUSY); } return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_add); + +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return bpf_prog_add(prog, 1); +} static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e206c2181412a128aece1598311beb8762f6edb2..f72f23b8fdab42b8e2c3add264bb2680f93bf940 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff +static bool may_write_pkt_data(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_XDP: + return true; + default: + return false; + } +} + static int check_packet_access(struct verifier_env *env, u32 regno, int off, int size) { @@ -713,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, switch (env->prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: break; default: verbose("verifier is misconfigured\n"); @@ -805,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE) { + if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { verbose("cannot write into packet\n"); return -EACCES; } + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into packet\n", value_regno); + return -EACCES; + } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); diff --git a/net/core/dev.c b/net/core/dev.c index 7894e406c80684754fac5d144f2b40a18897f153..2a9c39f8824e027cbf0ea344f64273bdf971dce9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -6614,6 +6615,38 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); +/** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @fd: new program fd or negative value to clear + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, int fd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct bpf_prog *prog = NULL; + struct netdev_xdp xdp = {}; + int err; + + if (!ops->ndo_xdp) + return -EOPNOTSUPP; + if (fd >= 0) { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + xdp.command = XDP_SETUP_PROG; + xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); + if (err < 0 && prog) + bpf_prog_put(prog); + + return err; +} +EXPORT_SYMBOL(dev_change_xdp_fd); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace diff --git a/net/core/filter.c b/net/core/filter.c index 22e3992c8b48a87dcc6250ce914871748d2948f7..6c627bc4be6e29647c16ff78efa9902f29af8069 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2410,6 +2410,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +xdp_func_proto(enum bpf_func_id func_id) +{ + return sk_filter_func_proto(func_id); +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2477,6 +2483,44 @@ static bool tc_cls_act_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool __is_valid_xdp_access(int off, int size, + enum bpf_access_type type) +{ + if (off < 0 || off >= sizeof(struct xdp_md)) + return false; + if (off % size != 0) + return false; + if (size != 4) + return false; + + return true; +} + +static bool xdp_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct xdp_md, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct xdp_md, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_xdp_access(off, size, type); +} + +void bpf_warn_invalid_xdp_action(u32 act) +{ + WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); + static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2628,6 +2672,29 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct xdp_md, data): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + dst_reg, src_reg, + offsetof(struct xdp_buff, data)); + break; + case offsetof(struct xdp_md, data_end): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + dst_reg, src_reg, + offsetof(struct xdp_buff, data_end)); + break; + } + + return insn - insn_buf; +} + static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -2640,6 +2707,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .convert_ctx_access = bpf_net_convert_ctx_access, }; +static const struct bpf_verifier_ops xdp_ops = { + .get_func_proto = xdp_func_proto, + .is_valid_access = xdp_is_valid_access, + .convert_ctx_access = xdp_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -2655,11 +2728,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_ACT, }; +static struct bpf_prog_type_list xdp_type __read_mostly = { + .ops = &xdp_ops, + .type = BPF_PROG_TYPE_XDP, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); + bpf_register_prog_type(&xdp_type); return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a9e3805af739c9b8d87fba66ed305fa00ced4e65..eba2b8260dbdf6d78a6ee312e3dec59a0b4eab43 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -891,6 +891,16 @@ static size_t rtnl_port_size(const struct net_device *dev, return port_self_size; } +static size_t rtnl_xdp_size(const struct net_device *dev) +{ + size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */ + + if (!dev->netdev_ops->ndo_xdp) + return 0; + else + return xdp_size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -927,6 +937,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + + rtnl_xdp_size(dev) /* IFLA_XDP */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1211,6 +1222,33 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct netdev_xdp xdp_op = {}; + struct nlattr *xdp; + int err; + + if (!dev->netdev_ops->ndo_xdp) + return 0; + xdp = nla_nest_start(skb, IFLA_XDP); + if (!xdp) + return -EMSGSIZE; + xdp_op.command = XDP_QUERY_PROG; + err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); + if (err) + goto err_cancel; + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached); + if (err) + goto err_cancel; + + nla_nest_end(skb, xdp); + return 0; + +err_cancel: + nla_nest_cancel(skb, xdp); + return err; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -1307,6 +1345,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; + if (rtnl_xdp_fill(skb, dev)) + goto nla_put_failure; + if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; @@ -1392,6 +1433,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, + [IFLA_XDP] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1429,6 +1471,11 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { + [IFLA_XDP_FD] = { .type = NLA_S32 }, + [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, +}; + static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) { const struct rtnl_link_ops *ops = NULL; @@ -2054,6 +2101,23 @@ static int do_setlink(const struct sk_buff *skb, status |= DO_SETLINK_NOTIFY; } + if (tb[IFLA_XDP]) { + struct nlattr *xdp[IFLA_XDP_MAX + 1]; + + err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], + ifla_xdp_policy); + if (err < 0) + goto errout; + + if (xdp[IFLA_XDP_FD]) { + err = dev_change_xdp_fd(dev, + nla_get_s32(xdp[IFLA_XDP_FD])); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + } + errout: if (status & DO_SETLINK_MODIFIED) { if (status & DO_SETLINK_NOTIFY) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index a98b780e974ca7e6be3d68b963a75340d177009b..d2d2b35c67eb12b308d85724c88f28943d8ea228 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -21,6 +21,8 @@ hostprogs-y += spintest hostprogs-y += map_perf_test hostprogs-y += test_overhead hostprogs-y += test_cgrp2_array_pin +hostprogs-y += xdp1 +hostprogs-y += xdp2 test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o @@ -42,6 +44,9 @@ spintest-objs := bpf_load.o libbpf.o spintest_user.o map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o +xdp1-objs := bpf_load.o libbpf.o xdp1_user.o +# reuse xdp1 source intentionally +xdp2-objs := bpf_load.o libbpf.o xdp1_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -64,6 +69,8 @@ always += test_overhead_tp_kern.o always += test_overhead_kprobe_kern.o always += parse_varlen.o parse_simple.o parse_ldabs.o always += test_cgrp2_tc_kern.o +always += xdp1_kern.o +always += xdp2_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -84,6 +91,8 @@ HOSTLOADLIBES_offwaketime += -lelf HOSTLOADLIBES_spintest += -lelf HOSTLOADLIBES_map_perf_test += -lelf -lrt HOSTLOADLIBES_test_overhead += -lelf -lrt +HOSTLOADLIBES_xdp1 += -lelf +HOSTLOADLIBES_xdp2 += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 022af71c2bb5c0e894562e93cfecb1b857920e77..0cfda23203206e5a764c9bdee14d1b9c616f62bb 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -50,6 +50,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; + bool is_xdp = strncmp(event, "xdp", 3) == 0; enum bpf_prog_type prog_type; char buf[256]; int fd, efd, err, id; @@ -66,6 +67,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_KPROBE; } else if (is_tracepoint) { prog_type = BPF_PROG_TYPE_TRACEPOINT; + } else if (is_xdp) { + prog_type = BPF_PROG_TYPE_XDP; } else { printf("Unknown event '%s'\n", event); return -1; @@ -79,6 +82,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_fd[prog_cnt++] = fd; + if (is_xdp) + return 0; + if (is_socket) { event += 6; if (*event != '/') @@ -319,6 +325,7 @@ int load_bpf_file(char *path) if (memcmp(shname_prog, "kprobe/", 7) == 0 || memcmp(shname_prog, "kretprobe/", 10) == 0 || memcmp(shname_prog, "tracepoint/", 11) == 0 || + memcmp(shname_prog, "xdp", 3) == 0 || memcmp(shname_prog, "socket", 6) == 0) load_and_attach(shname_prog, insns, data_prog->d_size); } @@ -336,6 +343,7 @@ int load_bpf_file(char *path) if (memcmp(shname, "kprobe/", 7) == 0 || memcmp(shname, "kretprobe/", 10) == 0 || memcmp(shname, "tracepoint/", 11) == 0 || + memcmp(shname, "xdp", 3) == 0 || memcmp(shname, "socket", 6) == 0) load_and_attach(shname, data->d_buf, data->d_size); } diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c new file mode 100644 index 0000000000000000000000000000000000000000..e7dd8ac40d122da6ecb1546a3d839e622dd15c0d --- /dev/null +++ b/samples/bpf/xdp1_kern.c @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PLUMgrid + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") dropcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 256, +}; + +static int parse_ipv4(void *data, u64 nh_off, void *data_end) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static int parse_ipv6(void *data, u64 nh_off, void *data_end) +{ + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp1") +int xdp_prog1(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + int rc = XDP_DROP; + long *value; + u16 h_proto; + u64 nh_off; + u32 index; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + if (h_proto == htons(ETH_P_IP)) + index = parse_ipv4(data, nh_off, data_end); + else if (h_proto == htons(ETH_P_IPV6)) + index = parse_ipv6(data, nh_off, data_end); + else + index = 0; + + value = bpf_map_lookup_elem(&dropcnt, &index); + if (value) + *value += 1; + + return rc; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c new file mode 100644 index 0000000000000000000000000000000000000000..a5e109e398a1f8fb08b6cc1f42190852ada841e2 --- /dev/null +++ b/samples/bpf/xdp1_user.c @@ -0,0 +1,181 @@ +/* Copyright (c) 2016 PLUMgrid + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_load.h" +#include "libbpf.h" + +static int set_link_xdp_fd(int ifindex, int fd) +{ + struct sockaddr_nl sa; + int sock, seq = 0, len, ret = -1; + char buf[4096]; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + struct nlmsghdr *nh; + struct nlmsgerr *err; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + goto cleanup; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; + + nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN); + nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); + nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len; + + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { + printf("send to netlink: %s\n", strerror(errno)); + goto cleanup; + } + + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + printf("recv from netlink: %s\n", strerror(errno)); + goto cleanup; + } + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != getpid()) { + printf("Wrong pid %d, expected %d\n", + nh->nlmsg_pid, getpid()); + goto cleanup; + } + if (nh->nlmsg_seq != seq) { + printf("Wrong seq %d, expected %d\n", + nh->nlmsg_seq, seq); + goto cleanup; + } + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + printf("nlmsg error %s\n", strerror(-err->error)); + goto cleanup; + case NLMSG_DONE: + break; + } + } + + ret = 0; + +cleanup: + close(sock); + return ret; +} + +static int ifindex; + +static void int_exit(int sig) +{ + set_link_xdp_fd(ifindex, -1); + exit(0); +} + +/* simple per-protocol drop counter + */ +static void poll_stats(int interval) +{ + unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + const unsigned int nr_keys = 256; + __u64 values[nr_cpus], prev[nr_keys][nr_cpus]; + __u32 key; + int i; + + memset(prev, 0, sizeof(prev)); + + while (1) { + sleep(interval); + + for (key = 0; key < nr_keys; key++) { + __u64 sum = 0; + + assert(bpf_lookup_elem(map_fd[0], &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[key][i]); + if (sum) + printf("proto %u: %10llu pkt/s\n", + key, sum / interval); + memcpy(prev[key], values, sizeof(values)); + } + } +} + +int main(int ac, char **argv) +{ + char filename[256]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (ac != 2) { + printf("usage: %s IFINDEX\n", argv[0]); + return 1; + } + + ifindex = strtoul(argv[1], NULL, 0); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + if (!prog_fd[0]) { + printf("load_bpf_file: %s\n", strerror(errno)); + return 1; + } + + signal(SIGINT, int_exit); + + if (set_link_xdp_fd(ifindex, prog_fd[0]) < 0) { + printf("link set xdp fd failed\n"); + return 1; + } + + poll_stats(2); + + return 0; +} diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c new file mode 100644 index 0000000000000000000000000000000000000000..38fe7e1d0db48678f03fc406e7ebbde78794001c --- /dev/null +++ b/samples/bpf/xdp2_kern.c @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PLUMgrid + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") dropcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 256, +}; + +static void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +static int parse_ipv4(void *data, u64 nh_off, void *data_end) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static int parse_ipv6(void *data, u64 nh_off, void *data_end) +{ + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp1") +int xdp_prog1(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + int rc = XDP_DROP; + long *value; + u16 h_proto; + u64 nh_off; + u32 index; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + if (h_proto == htons(ETH_P_IP)) + index = parse_ipv4(data, nh_off, data_end); + else if (h_proto == htons(ETH_P_IPV6)) + index = parse_ipv6(data, nh_off, data_end); + else + index = 0; + + value = bpf_map_lookup_elem(&dropcnt, &index); + if (value) + *value += 1; + + if (index == 17) { + swap_src_dst_mac(data); + rc = XDP_TX; + } + + return rc; +} + +char _license[] SEC("license") = "GPL";