Commit b9fb6318 authored by Mitko Haralanov, committed by Greg Kroah-Hartman

staging/rdma/hfi1: Prevent silent data corruption with user SDMA

User SDMA tracks progress into the submitted IO vectors by maintaining an offset
into each vector as packets are submitted. This offset is updated after every
successful submission of a txreq to the SDMA engine.

The same offset was used when determining whether an IO vector should be
'freed' (pages unpinned) in the SDMA callback functions.

This caused silent data corruption on the receive side in big jobs (> 2 nodes,
120 ranks each) because the send side was mistakenly unpinning the vector pages
before the HW had processed all descriptors referencing the vector.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Parent: febffe2c
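Editor's note on the mechanism: the sketch below is a minimal, self-contained userspace illustration of the bug and of the fix, not the driver code. The names fake_iovec, fake_txreq, VEC_LAST_PKT and the two completion functions are invented for this example; they only mirror the idea behind TXREQ_FLAGS_IOVEC_LAST_PKT in the diff that follows. Before the fix, any txreq's completion callback would unpin a vector once the request-wide offset reached iov_len, even though other txreqs still referencing that vector might not yet have been processed by the hardware. After the fix, each txreq carries its own per-vector flag, and only the txreq marked as the vector's last user performs the unpin.

/*
 * Hypothetical, simplified model -- NOT the hfi1 driver code.  The names
 * fake_iovec, fake_txreq and VEC_LAST_PKT are invented for illustration.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define VEC_LAST_PKT (1 << 0)	/* plays the role of TXREQ_FLAGS_IOVEC_LAST_PKT */

struct fake_iovec {
	size_t iov_len;		/* total length of the user buffer */
	size_t offset;		/* request-wide progress, advanced at submit time */
	bool pinned;		/* are the pages still pinned? */
};

struct fake_txreq {
	struct fake_iovec *vec;	/* vector this txreq pulls data from */
	unsigned char flags;	/* per-txreq, per-vector flags */
};

/* Old scheme: unpin whenever the shared offset says the vector is consumed. */
static void old_completion(struct fake_txreq *tx)
{
	if (tx->vec->offset == tx->vec->iov_len && tx->vec->pinned) {
		tx->vec->pinned = false;
		printf("old: unpinned, although other txreqs may still be in flight\n");
	}
}

/* New scheme: only the txreq flagged as the vector's last user unpins it. */
static void new_completion(struct fake_txreq *tx)
{
	if ((tx->flags & VEC_LAST_PKT) && tx->vec->pinned) {
		tx->vec->pinned = false;
		printf("new: unpinned by the last txreq that references the vector\n");
	}
}

int main(void)
{
	struct fake_iovec vec = { .iov_len = 8192, .offset = 0, .pinned = true };
	struct fake_txreq tx1 = { .vec = &vec, .flags = 0 };
	struct fake_txreq tx2 = { .vec = &vec, .flags = 0 };

	/* Submission side: tx1 and tx2 each describe 4096 bytes of vec, so the
	 * shared offset reaches iov_len as soon as both are *submitted*. */
	vec.offset += 4096;
	vec.offset += 4096;
	tx2.flags |= VEC_LAST_PKT;	/* the fix: tx2 owns the unpin */

	/* tx1 completes first, while tx2's descriptors are still queued in HW. */
	old_completion(&tx1);		/* bug: pages freed too early */

	vec.pinned = true;		/* reset and replay with the new scheme */
	new_completion(&tx1);		/* does nothing */
	new_completion(&tx2);		/* unpins exactly once, when tx2 completes */
	return 0;
}

In the old scheme the first completion frees the pages while the second txreq's descriptors may still be in flight; with the per-txreq flag, only the final user of the vector unpins, which is the property the patch below restores.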
@@ -146,7 +146,8 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
 #define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))

 /* Last packet in the request */
-#define USER_SDMA_TXREQ_FLAGS_LAST_PKT (1 << 0)
+#define TXREQ_FLAGS_REQ_LAST_PKT (1 << 0)
+#define TXREQ_FLAGS_IOVEC_LAST_PKT (1 << 0)

 #define SDMA_REQ_IN_USE 0
 #define SDMA_REQ_FOR_THREAD 1
@@ -249,13 +250,22 @@ struct user_sdma_request {
 	unsigned long flags;
 };

+/*
+ * A single txreq could span up to 3 physical pages when the MTU
+ * is sufficiently large (> 4K). Each of the IOV pointers also
+ * needs its own set of flags so that each vector can be handled
+ * independently of the others.
+ */
 struct user_sdma_txreq {
 	/* Packet header for the txreq */
 	struct hfi1_pkt_header hdr;
 	struct sdma_txreq txreq;
 	struct user_sdma_request *req;
-	struct user_sdma_iovec *iovec1;
-	struct user_sdma_iovec *iovec2;
+	struct {
+		struct user_sdma_iovec *vec;
+		u8 flags;
+	} iovecs[3];
+	int idx;
 	u16 flags;
 	unsigned busycount;
 	u64 seqnum;
@@ -294,21 +304,6 @@ static int defer_packet_queue(
 	unsigned seq);
 static void activate_packet_queue(struct iowait *, int);

-static inline int iovec_may_free(struct user_sdma_iovec *iovec,
-				 void (*free)(struct user_sdma_iovec *))
-{
-	if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
-		free(iovec);
-		return 1;
-	}
-	return 0;
-}
-
-static inline void iovec_set_complete(struct user_sdma_iovec *iovec)
-{
-	iovec->offset = iovec->iov.iov_len;
-}
-
 static int defer_packet_queue(
 	struct sdma_engine *sde,
 	struct iowait *wait,
@@ -825,11 +820,11 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 		tx->flags = 0;
 		tx->req = req;
 		tx->busycount = 0;
-		tx->iovec1 = NULL;
-		tx->iovec2 = NULL;
+		tx->idx = -1;
+		memset(tx->iovecs, 0, sizeof(tx->iovecs));

 		if (req->seqnum == req->info.npkts - 1)
-			tx->flags |= USER_SDMA_TXREQ_FLAGS_LAST_PKT;
+			tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;

 		/*
 		 * Calculate the payload size - this is min of the fragment
@@ -858,7 +853,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 				goto free_tx;
 			}
 		}
-		tx->iovec1 = iovec;
+		tx->iovecs[++tx->idx].vec = iovec;
 		datalen = compute_data_length(req, tx);
 		if (!datalen) {
 			SDMA_DBG(req,
@@ -948,10 +943,17 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 					      iovec->pages[pageidx],
 					      offset, len);
 			if (ret) {
+				int i;
+
 				dd_dev_err(pq->dd,
 					   "SDMA txreq add page failed %d\n",
 					   ret);
-				iovec_set_complete(iovec);
+				/* Mark all assigned vectors as complete so they
+				 * are unpinned in the callback. */
+				for (i = tx->idx; i >= 0; i--) {
+					tx->iovecs[i].flags |=
+						TXREQ_FLAGS_IOVEC_LAST_PKT;
+				}
 				goto free_txreq;
 			}
 			iov_offset += len;
@@ -959,8 +961,11 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 			data_sent += len;
 			if (unlikely(queued < datalen &&
 				     pageidx == iovec->npages &&
-				     req->iov_idx < req->data_iovs - 1)) {
+				     req->iov_idx < req->data_iovs - 1 &&
+				     tx->idx < ARRAY_SIZE(tx->iovecs))) {
 				iovec->offset += iov_offset;
+				tx->iovecs[tx->idx].flags |=
+					TXREQ_FLAGS_IOVEC_LAST_PKT;
 				iovec = &req->iovs[++req->iov_idx];
 				if (!iovec->pages) {
 					ret = pin_vector_pages(req, iovec);
@@ -968,8 +973,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 					goto free_txreq;
 			}
 			iov_offset = 0;
-			tx->iovec2 = iovec;
-
+			tx->iovecs[++tx->idx].vec = iovec;
 		}
 	}
 	/*
@@ -981,11 +985,15 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 			req->tidoffset += datalen;
 		req->sent += data_sent;
 		if (req->data_len) {
-			if (tx->iovec1 && !tx->iovec2)
-				tx->iovec1->offset += iov_offset;
-			else if (tx->iovec2)
-				tx->iovec2->offset += iov_offset;
+			tx->iovecs[tx->idx].vec->offset += iov_offset;
+			/* If we've reached the end of the io vector, mark it
+			 * so the callback can unpin the pages and free it. */
+			if (tx->iovecs[tx->idx].vec->offset ==
+			    tx->iovecs[tx->idx].vec->iov.iov_len)
+				tx->iovecs[tx->idx].flags |=
+					TXREQ_FLAGS_IOVEC_LAST_PKT;
 		}
+
 		/*
 		 * It is important to increment this here as it is used to
 		 * generate the BTH.PSN and, therefore, can't be bulk-updated
@@ -1214,7 +1222,7 @@ static int set_txreq_header(struct user_sdma_request *req,
 				req->seqnum));

 	/* Set ACK request on last packet */
-	if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
 		hdr->bth[2] |= cpu_to_be32(1UL<<31);

 	/* Set the new offset */
@@ -1246,7 +1254,7 @@ static int set_txreq_header(struct user_sdma_request *req,
 		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
 			  EXP_TID_GET(tidval, IDX));
 		/* Clear KDETH.SH only on the last packet */
-		if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
 			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
 		/*
 		 * Set the KDETH.OFFSET and KDETH.OM based on size of
@@ -1290,7 +1298,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 		/* BTH.PSN and BTH.A */
 		val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
 			(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
-		if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
 			val32 |= 1UL << 31;
 		AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
 		AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
@@ -1331,7 +1339,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
 				  (EXP_TID_GET(tidval, IDX) & 0x3ff));
 		/* Clear KDETH.SH on last packet */
-		if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) {
+		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
 			val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
 						     INTR) >> 16);
 			val &= cpu_to_le16(~(1U << 13));
@@ -1358,10 +1366,16 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
 	if (unlikely(!req || !pq))
 		return;

-	if (tx->iovec1)
-		iovec_may_free(tx->iovec1, unpin_vector_pages);
-	if (tx->iovec2)
-		iovec_may_free(tx->iovec2, unpin_vector_pages);
+	/* If we have any io vectors associated with this txreq,
+	 * check whether they need to be 'freed'. */
+	if (tx->idx != -1) {
+		int i;
+
+		for (i = tx->idx; i >= 0; i--) {
+			if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT)
+				unpin_vector_pages(tx->iovecs[i].vec);
+		}
+	}

 	tx_seqnum = tx->seqnum;
 	kmem_cache_free(pq->txreq_cache, tx);