diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index 41408f82afe87016f4b92cebe0fcbf36c9505040..55fe02ef37cbcd8c7dd45a3db7baadd123651587 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -213,12 +213,6 @@ struct user_sdma_request {
 	 * to 0.
 	 */
 	u8 omfactor;
-	/*
-	 * pointer to the user's task_struct. We are going to
-	 * get a reference to it so we can process io vectors
-	 * at a later time.
-	 */
-	struct task_struct *user_proc;
 	/*
 	 * pointer to the user's mm_struct. We are going to
 	 * get a reference to it so it doesn't get freed
@@ -245,9 +239,13 @@ struct user_sdma_request {
 	u16 tididx;
 	u32 sent;
 	u64 seqnum;
-	spinlock_t list_lock;
 	struct list_head txps;
+	spinlock_t txcmp_lock; /* protect txcmp list */
+	struct list_head txcmp;
 	unsigned long flags;
+	/* status of the last txreq completed */
+	int status;
+	struct work_struct worker;
 };
 
 /*
@@ -260,6 +258,7 @@ struct user_sdma_txreq {
 	/* Packet header for the txreq */
 	struct hfi1_pkt_header hdr;
 	struct sdma_txreq txreq;
+	struct list_head list;
 	struct user_sdma_request *req;
 	struct {
 		struct user_sdma_iovec *vec;
@@ -282,10 +281,12 @@
 static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
 static int num_user_pages(const struct iovec *);
 static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
+static void user_sdma_delayed_completion(struct work_struct *);
 static void user_sdma_free_request(struct user_sdma_request *);
 static int pin_vector_pages(struct user_sdma_request *,
 			    struct user_sdma_iovec *);
-static void unpin_vector_pages(struct user_sdma_iovec *);
+static void unpin_vector_pages(struct user_sdma_request *,
+			       struct user_sdma_iovec *);
 static int check_header_template(struct user_sdma_request *,
 				 struct hfi1_pkt_header *, u32, u32);
 static int set_txreq_header(struct user_sdma_request *,
@@ -391,6 +392,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
 	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 	pq->state = SDMA_PKT_Q_INACTIVE;
 	atomic_set(&pq->n_reqs, 0);
+	init_waitqueue_head(&pq->wait);
 
 	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
 		    activate_packet_queue);
@@ -451,26 +453,16 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
 		  uctxt->ctxt, fd->subctxt);
 	pq = fd->pq;
 	if (pq) {
-		u16 i, j;
-
 		spin_lock_irqsave(&uctxt->sdma_qlock, flags);
 		if (!list_empty(&pq->list))
 			list_del_init(&pq->list);
 		spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
 		iowait_sdma_drain(&pq->busy);
-		if (pq->reqs) {
-			for (i = 0, j = 0; i < atomic_read(&pq->n_reqs) &&
-				     j < pq->n_max_reqs; j++) {
-				struct user_sdma_request *req = &pq->reqs[j];
-
-				if (test_bit(SDMA_REQ_IN_USE, &req->flags)) {
-					set_comp_state(req, ERROR, -ECOMM);
-					user_sdma_free_request(req);
-					i++;
-				}
-			}
-			kfree(pq->reqs);
-		}
+		/* Wait until all requests have been freed. */
+		wait_event_interruptible(
+			pq->wait,
+			(ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
+		kfree(pq->reqs);
 		kmem_cache_destroy(pq->txreq_cache);
 		kfree(pq);
 		fd->pq = NULL;
@@ -544,8 +536,12 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 	req->data_iovs = req_iovcnt(info.ctrl) - 1;
 	req->pq = pq;
 	req->cq = cq;
+	req->status = -1;
 	INIT_LIST_HEAD(&req->txps);
-	spin_lock_init(&req->list_lock);
+	INIT_LIST_HEAD(&req->txcmp);
+	INIT_WORK(&req->worker, user_sdma_delayed_completion);
+
+	spin_lock_init(&req->txcmp_lock);
 	memcpy(&req->info, &info, sizeof(info));
 
 	if (req_opcode(info.ctrl) == EXPECTED)
@@ -685,18 +681,16 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 	sent = user_sdma_send_pkts(req, pcount);
 	if (unlikely(sent < 0)) {
 		if (sent != -EBUSY) {
-			ret = sent;
-			goto send_err;
+			req->status = sent;
+			set_comp_state(req, ERROR, req->status);
+			return sent;
 		} else
 			sent = 0;
 	}
 	atomic_inc(&pq->n_reqs);
+	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 
 	if (sent < req->info.npkts) {
-		/* Take the references to the user's task and mm_struct */
-		get_task_struct(current);
-		req->user_proc = current;
-
 		/*
 		 * This is a somewhat blocking send implementation.
 		 * The driver will block the caller until all packets of the
@@ -706,8 +700,10 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 		while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
 			ret = user_sdma_send_pkts(req, pcount);
 			if (ret < 0) {
-				if (ret != -EBUSY)
-					goto send_err;
+				if (ret != -EBUSY) {
+					req->status = ret;
+					return ret;
+				}
 				wait_event_interruptible_timeout(
 					pq->busy.wait_dma,
 					(pq->state == SDMA_PKT_Q_ACTIVE),
@@ -717,14 +713,10 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 		}
 	}
 
-	ret = 0;
 	*count += idx;
-	goto done;
-send_err:
-	set_comp_state(req, ERROR, ret);
+	return 0;
 free_req:
 	user_sdma_free_request(req);
-done:
 	return ret;
 }
 
@@ -825,6 +817,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 		tx->req = req;
 		tx->busycount = 0;
 		tx->idx = -1;
+		INIT_LIST_HEAD(&tx->list);
 		memset(tx->iovecs, 0, sizeof(tx->iovecs));
 
 		if (req->seqnum == req->info.npkts - 1)
@@ -949,9 +942,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 			if (ret) {
 				int i;
 
-				dd_dev_err(pq->dd,
-					   "SDMA txreq add page failed %d\n",
-					   ret);
+				SDMA_DBG(req, "SDMA txreq add page failed %d\n",
+					 ret);
 				/* Mark all assigned vectors as complete so they
 				 * are unpinned in the callback. */
 				for (i = tx->idx; i >= 0; i--) {
@@ -1045,52 +1037,58 @@ static inline int num_user_pages(const struct iovec *iov)
 static int pin_vector_pages(struct user_sdma_request *req,
 			    struct user_sdma_iovec *iovec)
 {
-	int ret = 0;
-	unsigned pinned;
+	int pinned, npages;
 
-	iovec->npages = num_user_pages(&iovec->iov);
-	iovec->pages = kcalloc(iovec->npages, sizeof(*iovec->pages),
-			       GFP_KERNEL);
+	npages = num_user_pages(&iovec->iov);
+	iovec->pages = kcalloc(npages, sizeof(*iovec->pages), GFP_KERNEL);
 	if (!iovec->pages) {
 		SDMA_DBG(req, "Failed page array alloc");
-		ret = -ENOMEM;
-		goto done;
+		return -ENOMEM;
 	}
-	/* If called by the kernel thread, use the user's mm */
-	if (current->flags & PF_KTHREAD)
-		use_mm(req->user_proc->mm);
-	pinned = get_user_pages_fast(
-		(unsigned long)iovec->iov.iov_base,
-		iovec->npages, 0, iovec->pages);
-	/* If called by the kernel thread, unuse the user's mm */
-	if (current->flags & PF_KTHREAD)
-		unuse_mm(req->user_proc->mm);
-	if (pinned != iovec->npages) {
-		SDMA_DBG(req, "Failed to pin pages (%u/%u)", pinned,
-			 iovec->npages);
-		ret = -EFAULT;
-		goto pfree;
+
+	/*
+	 * Get a reference to the process's mm so we can use it when
+	 * unpinning the io vectors.
+	 */
+	req->pq->user_mm = get_task_mm(current);
+
+	pinned = hfi1_acquire_user_pages((unsigned long)iovec->iov.iov_base,
+					 npages, 0, iovec->pages);
+
+	if (pinned < 0)
+		return pinned;
+
+	iovec->npages = pinned;
+	if (pinned != npages) {
+		SDMA_DBG(req, "Failed to pin pages (%d/%u)", pinned, npages);
+		unpin_vector_pages(req, iovec);
+		return -EFAULT;
 	}
-	goto done;
-pfree:
-	unpin_vector_pages(iovec);
-done:
-	return ret;
+	return 0;
 }
 
-static void unpin_vector_pages(struct user_sdma_iovec *iovec)
+static void unpin_vector_pages(struct user_sdma_request *req,
+			       struct user_sdma_iovec *iovec)
 {
-	unsigned i;
+	/*
+	 * Unpinning is done through the workqueue so use the
+	 * process's mm if we have a reference to it.
+	 */
+	if ((current->flags & PF_KTHREAD) && req->pq->user_mm)
+		use_mm(req->pq->user_mm);
 
-	if (ACCESS_ONCE(iovec->offset) != iovec->iov.iov_len) {
-		hfi1_cdbg(SDMA,
-			  "the complete vector has not been sent yet %llu %zu",
-			  iovec->offset, iovec->iov.iov_len);
-		return;
+	hfi1_release_user_pages(iovec->pages, iovec->npages, 0);
+
+	/*
+	 * Unuse the user's mm (see above) and release the
+	 * reference to it.
+	 */
+	if (req->pq->user_mm) {
+		if (current->flags & PF_KTHREAD)
+			unuse_mm(req->pq->user_mm);
+		mmput(req->pq->user_mm);
 	}
-	for (i = 0; i < iovec->npages; i++)
-		if (iovec->pages[i])
-			put_page(iovec->pages[i]);
+
 	kfree(iovec->pages);
 	iovec->pages = NULL;
 	iovec->npages = 0;
@@ -1358,54 +1356,116 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 	return diff;
 }
 
+/*
+ * SDMA tx request completion callback. Called when the SDMA progress
+ * state machine gets notification that the SDMA descriptors for this
+ * tx request have been processed by the DMA engine. Called in
+ * interrupt context.
+ */
 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
 			       int drain)
 {
 	struct user_sdma_txreq *tx =
 		container_of(txreq, struct user_sdma_txreq, txreq);
-	struct user_sdma_request *req = tx->req;
-	struct hfi1_user_sdma_pkt_q *pq = req ? req->pq : NULL;
-	u64 tx_seqnum;
+	struct user_sdma_request *req;
+	bool defer;
+	int i;
 
-	if (unlikely(!req || !pq))
+	if (!tx->req)
 		return;
 
-	/* If we have any io vectors associated with this txreq,
-	 * check whether they need to be 'freed'. */
-	if (tx->idx != -1) {
-		int i;
+	req = tx->req;
+	/*
+	 * If this is the callback for the last packet of the request,
+	 * queue up the request for clean up.
+	 */
+	defer = (tx->seqnum == req->info.npkts - 1);
 
-		for (i = tx->idx; i >= 0; i--) {
-			if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT)
-				unpin_vector_pages(tx->iovecs[i].vec);
+	/*
+	 * If we have any io vectors associated with this txreq,
+	 * check whether they need to be 'freed'. We can't free them
+	 * here because the unpin function needs to be able to sleep.
+	 */
+	for (i = tx->idx; i >= 0; i--) {
+		if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT) {
+			defer = true;
+			break;
 		}
 	}
 
-	tx_seqnum = tx->seqnum;
-	kmem_cache_free(pq->txreq_cache, tx);
-
+	req->status = status;
 	if (status != SDMA_TXREQ_S_OK) {
-		dd_dev_err(pq->dd, "SDMA completion with error %d", status);
-		set_comp_state(req, ERROR, status);
+		SDMA_DBG(req, "SDMA completion with error %d",
+			 status);
 		set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
-		/* Do not free the request until the sender loop has ack'ed
-		 * the error and we've seen all txreqs. */
-		if (tx_seqnum == ACCESS_ONCE(req->seqnum) &&
-		    test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) {
-			atomic_dec(&pq->n_reqs);
-			user_sdma_free_request(req);
-		}
+		defer = true;
+	}
+
+	/*
+	 * Defer the clean up of the iovectors and the request until later
+	 * so it can be done outside of interrupt context.
+	 */
+	if (defer) {
+		spin_lock(&req->txcmp_lock);
+		list_add_tail(&tx->list, &req->txcmp);
+		spin_unlock(&req->txcmp_lock);
+		schedule_work(&req->worker);
 	} else {
-		if (tx_seqnum == req->info.npkts - 1) {
-			/* We've sent and completed all packets in this
-			 * request. Signal completion to the user */
-			atomic_dec(&pq->n_reqs);
-			set_comp_state(req, COMPLETE, 0);
-			user_sdma_free_request(req);
+		kmem_cache_free(req->pq->txreq_cache, tx);
+	}
+}
+
+static void user_sdma_delayed_completion(struct work_struct *work)
+{
+	struct user_sdma_request *req =
+		container_of(work, struct user_sdma_request, worker);
+	struct hfi1_user_sdma_pkt_q *pq = req->pq;
+	struct user_sdma_txreq *tx = NULL;
+	unsigned long flags;
+	u64 seqnum;
+	int i;
+
+	while (1) {
+		spin_lock_irqsave(&req->txcmp_lock, flags);
+		if (!list_empty(&req->txcmp)) {
+			tx = list_first_entry(&req->txcmp,
+					      struct user_sdma_txreq, list);
+			list_del(&tx->list);
+		}
+		spin_unlock_irqrestore(&req->txcmp_lock, flags);
+		if (!tx)
+			break;
+
+		for (i = tx->idx; i >= 0; i--)
+			if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT)
+				unpin_vector_pages(req, tx->iovecs[i].vec);
+
+		seqnum = tx->seqnum;
+		kmem_cache_free(pq->txreq_cache, tx);
+		tx = NULL;
+
+		if (req->status != SDMA_TXREQ_S_OK) {
+			if (seqnum == ACCESS_ONCE(req->seqnum) &&
+			    test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) {
+				atomic_dec(&pq->n_reqs);
+				set_comp_state(req, ERROR, req->status);
+				user_sdma_free_request(req);
+				break;
+			}
+		} else {
+			if (seqnum == req->info.npkts - 1) {
+				atomic_dec(&pq->n_reqs);
+				set_comp_state(req, COMPLETE, 0);
+				user_sdma_free_request(req);
+				break;
+			}
 		}
 	}
-	if (!atomic_read(&pq->n_reqs))
+
+	if (!atomic_read(&pq->n_reqs)) {
 		xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
+		wake_up(&pq->wait);
+	}
 }
 
 static void user_sdma_free_request(struct user_sdma_request *req)
@@ -1426,10 +1486,8 @@ static void user_sdma_free_request(struct user_sdma_request *req)
 		for (i = 0; i < req->data_iovs; i++)
 			if (req->iovs[i].npages && req->iovs[i].pages)
-				unpin_vector_pages(&req->iovs[i]);
+				unpin_vector_pages(req, &req->iovs[i]);
 	}
 
-	if (req->user_proc)
-		put_task_struct(req->user_proc);
 	kfree(req->tids);
 	clear_bit(SDMA_REQ_IN_USE, &req->flags);
 }
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h
index 0046ffa774fefe330ad574cc9308f70932bf9de8..0afa28508a8a3335c0e040cdf46e89c221b90de0 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.h
+++ b/drivers/staging/rdma/hfi1/user_sdma.h
@@ -68,6 +68,8 @@ struct hfi1_user_sdma_pkt_q {
 	struct user_sdma_request *reqs;
 	struct iowait busy;
 	unsigned state;
+	wait_queue_head_t wait;
+	struct mm_struct *user_mm;
 };
 
 struct hfi1_user_sdma_comp_q {
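
The core idea in user_sdma_txreq_cb()/user_sdma_delayed_completion() above — an
interrupt-context completion callback queues finished txreqs on a spinlock-protected
list and a workqueue worker drains that list in process context, where sleeping
operations (page unpinning, mmput()) are legal — is a standard kernel pattern. A
minimal self-contained sketch of that pattern follows; the demo_* names are
hypothetical and not part of this driver, only the kernel APIs used are real:

/*
 * Sketch: defer cleanup from an interrupt-context completion callback
 * to process context via a workqueue. Mirrors the txcmp_lock/txcmp/
 * worker trio added by this patch; demo_* names are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct demo_queue {
	spinlock_t lock;		/* protects 'done' */
	struct list_head done;		/* completed items awaiting cleanup */
	struct work_struct worker;	/* drains 'done' in process context */
};

struct demo_item {
	struct list_head list;
};

/* Interrupt context: just queue the item and kick the worker. */
static void demo_complete(struct demo_queue *q, struct demo_item *item)
{
	spin_lock(&q->lock);
	list_add_tail(&item->list, &q->done);
	spin_unlock(&q->lock);
	schedule_work(&q->worker);	/* IRQ-safe; no-op if already queued */
}

/* Process context: sleeping cleanup (unpinning pages, mmput(), ...) is OK. */
static void demo_worker_fn(struct work_struct *work)
{
	struct demo_queue *q = container_of(work, struct demo_queue, worker);
	unsigned long flags;

	for (;;) {
		struct demo_item *item = NULL;

		spin_lock_irqsave(&q->lock, flags);
		if (!list_empty(&q->done)) {
			item = list_first_entry(&q->done,
						struct demo_item, list);
			list_del(&item->list);
		}
		spin_unlock_irqrestore(&q->lock, flags);
		if (!item)
			break;
		kfree(item);		/* stand-in for the real cleanup */
	}
}

static void demo_queue_init(struct demo_queue *q)
{
	spin_lock_init(&q->lock);
	INIT_LIST_HEAD(&q->done);
	INIT_WORK(&q->worker, demo_worker_fn);
}

Note the lock discipline: the callback side may take the plain spin_lock()
because it only ever runs in hard-IRQ context, while the worker must use
spin_lock_irqsave() so it cannot deadlock against the callback interrupting it
on the same CPU — the same split used by txcmp_lock in this patch.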