net/mlx5e: Remove page_ref bulking in Striding RQ

When many packets reside on the same page, the bulking of page_ref modifications reduces the total number of atomic operations executed. Besides the necessary 2 operations on page alloc/free, we have the following extra ops per page: - one on WQE allocation (bump refcnt to maximum possible), - zero ops for SKBs, - one on WQE free, a constant of two operations in total, no matter how many packets/SKBs actually populate the page. Without this bulking, we have: - no ops on WQE allocation or free, - one op per SKB, Comparing the two methods when PAGE_SIZE is 4K: - As mentioned above, bulking method always executes 2 operations, not more, but not less. - In the default MTU configuration (1500, stride size is 2K), the non-bulking method execute 2 ops as well. - For larger MTUs with stride size of 4K, non-bulking method executes only a single op. - For XDP (stride size of 4K, no SKBs), non-bulking method executes no ops at all! Hence, to optimize the flows with linear SKB and XDP over Striding RQ, we here remove the page_ref bulking method. Performance testing: ConnectX-5, Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz. Single core packet rate (64 bytes). Early drop in TC: no degradation. XDP_DROP: before: 14,270,188 pps after: 20,503,603 pps, 43% improvement. Signed-off-by: N Tariq Toukan <tariqt@mellanox.com> Signed-off-by: N Saeed Mahameed <saeedm@mellanox.com>

net/mlx5e: Remove page_ref bulking in Striding RQ
When many packets reside on the same page, the bulking of page_ref modifications reduces the total number of atomic operations executed. Besides the necessary 2 operations on page alloc/free, we have the following extra ops per page: - one on WQE allocation (bump refcnt to maximum possible), - zero ops for SKBs, - one on WQE free, a constant of two operations in total, no matter how many packets/SKBs actually populate the page. Without this bulking, we have: - no ops on WQE allocation or free, - one op per SKB, Comparing the two methods when PAGE_SIZE is 4K: - As mentioned above, bulking method always executes 2 operations, not more, but not less. - In the default MTU configuration (1500, stride size is 2K), the non-bulking method execute 2 ops as well. - For larger MTUs with stride size of 4K, non-bulking method executes only a single op. - For XDP (stride size of 4K, no SKBs), non-bulking method executes no ops at all! Hence, to optimize the flows with linear SKB and XDP over Striding RQ, we here remove the page_ref bulking method. Performance testing: ConnectX-5, Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz. Single core packet rate (64 bytes). Early drop in TC: no degradation. XDP_DROP: before: 14,270,188 pps after: 20,503,603 pps, 43% improvement. Signed-off-by: N Tariq Toukan <tariqt@mellanox.com> Signed-off-by: N Saeed Mahameed <saeedm@mellanox.com>
9f9e9cd5 · Tariq Toukan · Saeed Mahameed · 22f45398 · 9f9e9cd5 · 9f9e9cd5
Showing with 16 addition and 32 deletion

drivers/net/ethernet/mellanox/mlx5/core/en.h drivers/net/ethernet/mellanox/mlx5/core/en.h +0 -1

drivers/net/ethernet/mellanox/mlx5/core/en_rx.c drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +16 -31

未找到文件。
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -456,7 +456,6 @@ struct mlx5e_umr_dma_info {
 struct mlx5e_mpw_info {
 	struct mlx5e_umr_dma_info umr;
 	u16 consumed_strides;
-	u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE];
 	DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 };

--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -296,37 +296,28 @@ void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 		mlx5e_free_rx_wqe(rq, wi);
 }
-static inline int mlx5e_mpwqe_strides_per_page(struct mlx5e_rq *rq)
-{
-	return rq->mpwqe.num_strides >> MLX5_MPWRQ_WQE_PAGE_ORDER;
-}
 static inline void mlx5e_add_skb_frag_mpwqe(struct mlx5e_rq *rq,
 					    struct sk_buff *skb,
-					    struct mlx5e_mpw_info *wi,
+					    struct mlx5e_dma_info *di,
-					    u32 page_idx, u32 frag_offset,
+					    u32 frag_offset, u32 len)
-					    u32 len)
 {
 	unsigned int truesize = ALIGN(len, BIT(rq->mpwqe.log_stride_sz));
 	dma_sync_single_for_cpu(rq->pdev,
-				wi->umr.dma_info[page_idx].addr + frag_offset,
+				di->addr + frag_offset,
 				len, DMA_FROM_DEVICE);
-	wi->skbs_frags[page_idx]++;
+	page_ref_inc(di->page);
 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
-			wi->umr.dma_info[page_idx].page, frag_offset,
+			di->page, frag_offset, len, truesize);
-			len, truesize);
 }
 static inline void
 mlx5e_copy_skb_header_mpwqe(struct device *pdev,
 			    struct sk_buff *skb,
-			    struct mlx5e_mpw_info *wi,
+			    struct mlx5e_dma_info *dma_info,
-			    u32 page_idx, u32 offset,
+			    u32 offset, u32 headlen)
-			    u32 headlen)
 {
 	u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - offset);
-	struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[page_idx];
 	unsigned int len;
 	 /* Aligning len to sizeof(long) optimizes memcpy performance */
@@ -351,15 +342,12 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 {
 	const bool no_xdp_xmit =
 		bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
-	int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
 	struct mlx5e_dma_info *dma_info = wi->umr.dma_info;
 	int i;
-	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
+	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++)
-		page_ref_sub(dma_info[i].page, pg_strides - wi->skbs_frags[i]);
 		if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
 			mlx5e_page_release(rq, &dma_info[i], true);
-	}
 }
 static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
@@ -380,7 +368,6 @@ static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
 static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 {
 	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
-	int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
 	struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0];
 	struct mlx5e_icosq *sq = &rq->channel->icosq;
 	struct mlx5_wq_cyc *wq = &sq->wq;
@@ -403,10 +390,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 		if (unlikely(err))
 			goto err_unmap;
 		umr_wqe->inline_mtts[i].ptag = cpu_to_be64(dma_info->addr | MLX5_EN_WR);
-		page_ref_add(dma_info->page, pg_strides);
 	}
-	memset(wi->skbs_frags, 0, sizeof(*wi->skbs_frags) * MLX5_MPWRQ_PAGES_PER_WQE);
 	bitmap_zero(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 	wi->consumed_strides = 0;
@@ -425,7 +410,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 err_unmap:
 	while (--i >= 0) {
 		dma_info--;
-		page_ref_sub(dma_info->page, pg_strides);
 		mlx5e_page_release(rq, dma_info, true);
 	}
 	rq->stats.buff_alloc_err++;
@@ -987,9 +971,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
 				   u16 cqe_bcnt, u32 head_offset, u32 page_idx)
 {
 	u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
+	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
 	u32 frag_offset    = head_offset + headlen;
-	u16 byte_cnt       = cqe_bcnt - headlen;
+	u32 byte_cnt       = cqe_bcnt - headlen;
-	u32 head_page_idx  = page_idx;
+	struct mlx5e_dma_info *head_di = di;
 	struct sk_buff *skb;
 	skb = napi_alloc_skb(rq->cq.napi,
@@ -1002,7 +987,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
 	prefetchw(skb->data);
 	if (unlikely(frag_offset >= PAGE_SIZE)) {
-		page_idx++;
+		di++;
 		frag_offset -= PAGE_SIZE;
 	}
@@ -1010,14 +995,14 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
 		u32 pg_consumed_bytes =
 			min_t(u32, PAGE_SIZE - frag_offset, byte_cnt);
-		mlx5e_add_skb_frag_mpwqe(rq, skb, wi, page_idx, frag_offset,
+		mlx5e_add_skb_frag_mpwqe(rq, skb, di, frag_offset,
 					 pg_consumed_bytes);
 		byte_cnt -= pg_consumed_bytes;
 		frag_offset = 0;
-		page_idx++;
+		di++;
 	}
 	/* copy header */
-	mlx5e_copy_skb_header_mpwqe(rq->pdev, skb, wi, head_page_idx,
+	mlx5e_copy_skb_header_mpwqe(rq->pdev, skb, head_di,
 				    head_offset, headlen);
 	/* skb linear part was allocated with headlen and aligned to long */
 	skb->tail += headlen;
@@ -1060,7 +1045,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 		return NULL;
 	/* queue up for recycling/reuse */
-	wi->skbs_frags[page_idx]++;
+	page_ref_inc(di->page);
 	return skb;
 }