提交 60bbf7ee 编写于 作者: J Jesper Dangaard Brouer 提交者: David S. Miller

mlx5: use page_pool for xdp_return_frame call

This patch shows how it is possible to have both the driver local page
cache, which uses elevated refcnt for "catching"/avoiding SKB
put_page returns the page through the page allocator.  And at the
same time, have pages getting returned to the page_pool from
ndp_xdp_xmit DMA completion.

The performance improvement for XDP_REDIRECT in this patch is really
good.  Especially considering that (currently) the xdp_return_frame
API and page_pool_put_page() does per frame operations of both
rhashtable ID-lookup and locked return into (page_pool) ptr_ring.
(It is the plan to remove these per frame operation in a followup
patchset).

The benchmark performed was RX on mlx5 and XDP_REDIRECT out ixgbe,
with xdp_redirect_map (using devmap) . And the target/maximum
capability of ixgbe is 13Mpps (on this HW setup).

Before this patch for mlx5, XDP redirected frames were returned via
the page allocator.  The single flow performance was 6Mpps, and if I
started two flows the collective performance drop to 4Mpps, because we
hit the page allocator lock (further negative scaling occurs).

Two test scenarios need to be covered, for xdp_return_frame API, which
is DMA-TX completion running on same-CPU or cross-CPU free/return.
Results were same-CPU=10Mpps, and cross-CPU=12Mpps.  This is very
close to our 13Mpps max target.

The reason max target isn't reached in cross-CPU test, is likely due
to RX-ring DMA unmap/map overhead (which doesn't occur in ixgbe to
ixgbe testing).  It is also planned to remove this unnecessary DMA
unmap in a later patchset

V2: Adjustments requested by Tariq
 - Changed page_pool_create return codes not return NULL, only
   ERR_PTR, as this simplifies err handling in drivers.
 - Save a branch in mlx5e_page_release
 - Correct page_pool size calc for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

V5: Updated patch desc

V8: Adjust for b0cedc84 ("net/mlx5e: Remove rq_headroom field from params")
V9:
 - Adjust for 121e8927 ("net/mlx5e: Refactor RQ XDP_TX indication")
 - Adjust for 73281b78 ("net/mlx5e: Derive Striding RQ size from MTU")
 - Correct handling if page_pool_create fail for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

V10: Req from Tariq
 - Change pool_size calc for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ
Signed-off-by: NJesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: NTariq Toukan <tariqt@mellanox.com>
Acked-by: NSaeed Mahameed <saeedm@mellanox.com>
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
上级 57d0a1c1
...@@ -53,6 +53,8 @@ ...@@ -53,6 +53,8 @@
#include "mlx5_core.h" #include "mlx5_core.h"
#include "en_stats.h" #include "en_stats.h"
struct page_pool;
#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
#define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN) #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
...@@ -534,6 +536,7 @@ struct mlx5e_rq { ...@@ -534,6 +536,7 @@ struct mlx5e_rq {
unsigned int hw_mtu; unsigned int hw_mtu;
struct mlx5e_xdpsq xdpsq; struct mlx5e_xdpsq xdpsq;
DECLARE_BITMAP(flags, 8); DECLARE_BITMAP(flags, 8);
struct page_pool *page_pool;
/* control */ /* control */
struct mlx5_wq_ctrl wq_ctrl; struct mlx5_wq_ctrl wq_ctrl;
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include <linux/mlx5/fs.h> #include <linux/mlx5/fs.h>
#include <net/vxlan.h> #include <net/vxlan.h>
#include <linux/bpf.h> #include <linux/bpf.h>
#include <net/page_pool.h>
#include "eswitch.h" #include "eswitch.h"
#include "en.h" #include "en.h"
#include "en_tc.h" #include "en_tc.h"
...@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, ...@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
struct mlx5e_rq_param *rqp, struct mlx5e_rq_param *rqp,
struct mlx5e_rq *rq) struct mlx5e_rq *rq)
{ {
struct page_pool_params pp_params = { 0 };
struct mlx5_core_dev *mdev = c->mdev; struct mlx5_core_dev *mdev = c->mdev;
void *rqc = rqp->rqc; void *rqc = rqp->rqc;
void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
u32 byte_count; u32 byte_count, pool_size;
int npages; int npages;
int wq_sz; int wq_sz;
int err; int err;
...@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, ...@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params); rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
pool_size = 1 << params->log_rq_mtu_frames;
switch (rq->wq_type) { switch (rq->wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
rq->post_wqes = mlx5e_post_rx_mpwqes; rq->post_wqes = mlx5e_post_rx_mpwqes;
rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe; rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
...@@ -512,13 +517,31 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, ...@@ -512,13 +517,31 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->mkey_be = c->mkey_be; rq->mkey_be = c->mkey_be;
} }
/* This must only be activate for order-0 pages */ /* Create a page_pool and register it with rxq */
if (rq->xdp_prog) { pp_params.order = rq->buff.page_order;
pp_params.flags = 0; /* No-internal DMA mapping in page_pool */
pp_params.pool_size = pool_size;
pp_params.nid = cpu_to_node(c->cpu);
pp_params.dev = c->pdev;
pp_params.dma_dir = rq->buff.map_dir;
/* page_pool can be used even when there is no rq->xdp_prog,
* given page_pool does not handle DMA mapping there is no
* required state to clear. And page_pool gracefully handle
* elevated refcnt.
*/
rq->page_pool = page_pool_create(&pp_params);
if (IS_ERR(rq->page_pool)) {
if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
kfree(rq->wqe.frag_info);
err = PTR_ERR(rq->page_pool);
rq->page_pool = NULL;
goto err_rq_wq_destroy;
}
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
MEM_TYPE_PAGE_ORDER0, NULL); MEM_TYPE_PAGE_POOL, rq->page_pool);
if (err) if (err)
goto err_rq_wq_destroy; goto err_rq_wq_destroy;
}
for (i = 0; i < wq_sz; i++) { for (i = 0; i < wq_sz; i++) {
struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
...@@ -556,6 +579,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, ...@@ -556,6 +579,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
if (rq->xdp_prog) if (rq->xdp_prog)
bpf_prog_put(rq->xdp_prog); bpf_prog_put(rq->xdp_prog);
xdp_rxq_info_unreg(&rq->xdp_rxq); xdp_rxq_info_unreg(&rq->xdp_rxq);
if (rq->page_pool)
page_pool_destroy(rq->page_pool);
mlx5_wq_destroy(&rq->wq_ctrl); mlx5_wq_destroy(&rq->wq_ctrl);
return err; return err;
...@@ -569,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) ...@@ -569,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
bpf_prog_put(rq->xdp_prog); bpf_prog_put(rq->xdp_prog);
xdp_rxq_info_unreg(&rq->xdp_rxq); xdp_rxq_info_unreg(&rq->xdp_rxq);
if (rq->page_pool)
page_pool_destroy(rq->page_pool);
switch (rq->wq_type) { switch (rq->wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include <linux/bpf_trace.h> #include <linux/bpf_trace.h>
#include <net/busy_poll.h> #include <net/busy_poll.h>
#include <net/ip6_checksum.h> #include <net/ip6_checksum.h>
#include <net/page_pool.h>
#include "en.h" #include "en.h"
#include "en_tc.h" #include "en_tc.h"
#include "eswitch.h" #include "eswitch.h"
...@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, ...@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
if (mlx5e_rx_cache_get(rq, dma_info)) if (mlx5e_rx_cache_get(rq, dma_info))
return 0; return 0;
dma_info->page = dev_alloc_pages(rq->buff.page_order); dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
if (unlikely(!dma_info->page)) if (unlikely(!dma_info->page))
return -ENOMEM; return -ENOMEM;
...@@ -246,11 +247,16 @@ static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, ...@@ -246,11 +247,16 @@ static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
bool recycle) bool recycle)
{ {
if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info)) if (likely(recycle)) {
if (mlx5e_rx_cache_put(rq, dma_info))
return; return;
mlx5e_page_dma_unmap(rq, dma_info);
page_pool_recycle_direct(rq->page_pool, dma_info->page);
} else {
mlx5e_page_dma_unmap(rq, dma_info); mlx5e_page_dma_unmap(rq, dma_info);
put_page(dma_info->page); put_page(dma_info->page);
}
} }
static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq, static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册