/*
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

79
static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
M
Mike Marciniszyn 已提交
99 100 101
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
102
	struct sdma_engine *sde,
103
	struct iowait_work *wait,
104
	struct sdma_txreq *txreq,
105 106
	uint seq,
	bool pkts_sent);
107 108 109 110
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
D
Dean Luick 已提交
111 112
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
113 114
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
115 116 117 118

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
D
Dean Luick 已提交
119
	.evict = sdma_rb_evict,
120 121 122
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};
M
Mike Marciniszyn 已提交
123 124 125

static int defer_packet_queue(
	struct sdma_engine *sde,
126
	struct iowait_work *wait,
M
Mike Marciniszyn 已提交
127
	struct sdma_txreq *txreq,
128 129
	uint seq,
	bool pkts_sent)
M
Mike Marciniszyn 已提交
130 131
{
	struct hfi1_user_sdma_pkt_q *pq =
132
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
M
Mike Marciniszyn 已提交
133

134 135 136
	write_seqlock(&sde->waitlock);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
M
Mike Marciniszyn 已提交
137 138 139 140 141 142
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
143
	if (list_empty(&pq->busy.list)) {
144
		pq->busy.lock = &sde->waitlock;
145
		iowait_get_priority(&pq->busy);
146
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
147
	}
148
	write_sequnlock(&sde->waitlock);
M
Mike Marciniszyn 已提交
149 150
	return -EBUSY;
eagain:
151
	write_sequnlock(&sde->waitlock);
M
Mike Marciniszyn 已提交
152 153 154 155 156 157 158
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
159
	pq->busy.lock = NULL;
M
Mike Marciniszyn 已提交
160 161 162 163
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
};

164 165
int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
M
Mike Marciniszyn 已提交
166
{
167
	int ret = -ENOMEM;
M
Mike Marciniszyn 已提交
168 169 170 171 172
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

173 174
	if (!uctxt || !fd)
		return -EBADF;
M
Mike Marciniszyn 已提交
175

176 177
	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;
M
Mike Marciniszyn 已提交
178 179 180 181

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
182
	if (!pq)
183
		return -ENOMEM;
M
Mike Marciniszyn 已提交
184 185
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
186
	pq->subctxt = fd->subctxt;
M
Mike Marciniszyn 已提交
187 188
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
189
	init_waitqueue_head(&pq->wait);
D
Dean Luick 已提交
190
	atomic_set(&pq->n_locked, 0);
I
Ira Weiny 已提交
191
	pq->mm = fd->mm;
M
Mike Marciniszyn 已提交
192

193
	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
194
		    activate_packet_queue, NULL, NULL);
M
Mike Marciniszyn 已提交
195
	pq->reqidx = 0;
196 197 198 199 200 201 202 203 204 205 206 207 208

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				 sizeof(*pq->req_in_use),
				 GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

M
Mike Marciniszyn 已提交
209
	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
210
		 fd->subctxt);
M
Mike Marciniszyn 已提交
211
	pq->txreq_cache = kmem_cache_create(buf,
212
					    sizeof(struct user_sdma_txreq),
M
Mike Marciniszyn 已提交
213 214
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
215
					    NULL);
M
Mike Marciniszyn 已提交
216 217 218 219 220
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}
221

M
Mike Marciniszyn 已提交
222
	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
223
	if (!cq)
M
Mike Marciniszyn 已提交
224 225
		goto cq_nomem;

226 227
	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
228
	if (!cq->comps)
M
Mike Marciniszyn 已提交
229
		goto cq_comps_nomem;
230

M
Mike Marciniszyn 已提交
231 232
	cq->nentries = hfi1_sdma_comp_ring_size;

233 234
	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
235 236
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
237
		goto pq_mmu_fail;
238 239
	}

240
	rcu_assign_pointer(fd->pq, pq);
241 242 243 244 245 246
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
M
Mike Marciniszyn 已提交
247 248 249 250 251
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
252 253
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
M
Mike Marciniszyn 已提交
254 255 256
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);
257

M
Mike Marciniszyn 已提交
258 259 260
	return ret;
}

261 262 263 264 265 266 267 268 269 270 271 272 273 274 275
/*
 * flush_pq_iowait() - take the packet queue off the list it is waiting on
 * @pq: user SDMA packet queue
 *
 * If a wait lock has been recorded for the queue, take it (IRQ-safe) and,
 * when the queue's iowait entry is still linked on a list, unlink it and
 * forget the lock.  Does nothing when no lock is recorded.
 */
static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
	seqlock_t *waitlock = pq->busy.lock;
	unsigned long irqflags;

	if (waitlock) {
		write_seqlock_irqsave(waitlock, irqflags);
		if (!list_empty(&pq->busy.list)) {
			list_del_init(&pq->busy.list);
			pq->busy.lock = NULL;
		}
		write_sequnlock_irqrestore(waitlock, irqflags);
	}
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
M
Mike Marciniszyn 已提交
278 279 280
{
	struct hfi1_user_sdma_pkt_q *pq;

281 282
	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

283 284 285
	spin_lock(&fd->pq_rcu_lock);
	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				    lockdep_is_held(&fd->pq_rcu_lock));
M
Mike Marciniszyn 已提交
286
	if (pq) {
287 288 289 290
		rcu_assign_pointer(fd->pq, NULL);
		spin_unlock(&fd->pq_rcu_lock);
		synchronize_srcu(&fd->pq_srcu);
		/* at this point there can be no more new requests */
291 292
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
M
Mike Marciniszyn 已提交
293
		iowait_sdma_drain(&pq->busy);
294 295 296
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
297
			!atomic_read(&pq->n_reqs));
298
		kfree(pq->reqs);
299
		kfree(pq->req_in_use);
300
		kmem_cache_destroy(pq->txreq_cache);
301
		flush_pq_iowait(pq);
M
Mike Marciniszyn 已提交
302
		kfree(pq);
303 304
	} else {
		spin_unlock(&fd->pq_rcu_lock);
M
Mike Marciniszyn 已提交
305 306
	}
	if (fd->cq) {
307
		vfree(fd->cq->comps);
M
Mike Marciniszyn 已提交
308 309 310 311 312 313
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334
/*
 * dlid_to_selector() - map a destination LID to a small selector value
 * @dlid: destination LID
 *
 * The DLID is folded to an 8-bit hash (high byte XOR low byte).  The first
 * time a hash bucket is seen it is assigned the next selector in a
 * 0..0x7F round-robin sequence; later lookups of the same bucket return the
 * same selector.  0xFF marks a bucket that has not been assigned yet.
 *
 * Return: the selector assigned to the DLID's hash bucket.
 */
static u8 dlid_to_selector(u16 dlid)
{
	static u8 selector_of[256];
	static u8 next_selector;
	static int table_ready;
	u8 *slot;

	/* Lazily mark every bucket as unassigned on first use. */
	if (!table_ready) {
		memset(selector_of, 0xFF, 256);
		table_ready = 1;
	}

	slot = &selector_of[((dlid >> 8) ^ dlid) & 0xFF];
	if (*slot != 0xFF)
		return *slot;

	*slot = next_selector;
	next_selector = (next_selector + 1) & 0x7F;
	return *slot;
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
342 343 344
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
M
Mike Marciniszyn 已提交
345
{
346
	int ret = 0, i;
347
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
348 349
	struct hfi1_user_sdma_pkt_q *pq =
		srcu_dereference(fd->pq, &fd->pq_srcu);
350
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
M
Mike Marciniszyn 已提交
351 352 353 354 355 356
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
357 358
	u16 pkey;
	u32 slid;
359
	u16 dlid;
360
	u32 selector;
M
Mike Marciniszyn 已提交
361 362 363 364 365

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
366
		   dd->unit, uctxt->ctxt, fd->subctxt,
M
Mike Marciniszyn 已提交
367
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
368
		return -EINVAL;
M
Mike Marciniszyn 已提交
369 370 371 372
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
373
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
374
		return -EFAULT;
M
Mike Marciniszyn 已提交
375
	}
376

377
	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
M
Mike Marciniszyn 已提交
378
				     (u16 *)&info);
379 380 381 382 383 384 385
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

386 387 388 389 390 391 392 393 394 395 396 397
	/*
	 * Sanity check the header io vector count.  Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

M
Mike Marciniszyn 已提交
398 399 400
	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
401
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
402
		return -EINVAL;
M
Mike Marciniszyn 已提交
403
	}
404 405 406 407 408 409 410 411

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
M
Mike Marciniszyn 已提交
412
	/*
413
	 * All safety checks have been done and this request has been claimed.
M
Mike Marciniszyn 已提交
414
	 */
415 416
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
M
Mike Marciniszyn 已提交
417
	req = pq->reqs + info.comp_idx;
418
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
419
	req->data_len  = 0;
M
Mike Marciniszyn 已提交
420 421
	req->pq = pq;
	req->cq = cq;
422
	req->ahg_idx = -1;
423 424 425 426 427 428
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
429
	req->has_error = 0;
M
Mike Marciniszyn 已提交
430
	INIT_LIST_HEAD(&req->txps);
431

M
Mike Marciniszyn 已提交
432 433
	memcpy(&req->info, &info, sizeof(info));

434 435 436
	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

437 438 439 440 441 442 443 444
	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
M
Mike Marciniszyn 已提交
445
		req->data_iovs--;
446
	}
M
Mike Marciniszyn 已提交
447 448 449 450

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
451 452
		ret = -EINVAL;
		goto free_req;
M
Mike Marciniszyn 已提交
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

490
	/* Checking P_KEY for requests from user-space */
491 492 493
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
494 495 496 497
		ret = -EINVAL;
		goto free_req;
	}

M
Mike Marciniszyn 已提交
498 499 500 501 502 503 504 505 506 507 508 509
	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
510 511 512 513
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
M
Mike Marciniszyn 已提交
514 515 516
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
		 KDETH_OM_LARGE : KDETH_OM_SMALL);
517 518
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
M
Mike Marciniszyn 已提交
519 520 521
	idx++;

	/* Save all the IO vector structures */
522
	for (i = 0; i < req->data_iovs; i++) {
523
		req->iovs[i].offset = 0;
524
		INIT_LIST_HEAD(&req->iovs[i].list);
525 526 527
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
528 529
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
530
			req->data_iovs = i;
531 532
			goto free_req;
		}
533
		req->data_len += req->iovs[i].iov.iov_len;
M
Mike Marciniszyn 已提交
534
	}
535 536
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
M
Mike Marciniszyn 已提交
537 538 539 540 541 542 543 544 545 546 547 548
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
549
		u32 *tmp;
M
Mike Marciniszyn 已提交
550 551 552 553 554

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}
555

M
Mike Marciniszyn 已提交
556 557 558 559 560 561
		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
562 563 564 565
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
M
Mike Marciniszyn 已提交
566 567 568 569
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
570
		req->tids = tmp;
M
Mike Marciniszyn 已提交
571
		req->n_tids = ntids;
572
		req->tididx = 0;
M
Mike Marciniszyn 已提交
573 574 575
		idx++;
	}

576 577
	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
578 579
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);
580

M
Mike Marciniszyn 已提交
581 582 583 584 585 586
	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
587 588
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);
M
Mike Marciniszyn 已提交
589

590
	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
591
	pq->state = SDMA_PKT_Q_ACTIVE;
M
Mike Marciniszyn 已提交
592

593 594 595 596 597 598
	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
599
	while (req->seqsubmitted != req->info.npkts) {
600 601
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
602 603
			if (ret != -EBUSY)
				goto free_req;
604
			if (wait_event_interruptible_timeout(
605
				pq->busy.wait_dma,
606
				pq->state == SDMA_PKT_Q_ACTIVE,
607
				msecs_to_jiffies(
608 609
					SDMA_IOWAIT_TIMEOUT)) <= 0)
				flush_pq_iowait(pq);
M
Mike Marciniszyn 已提交
610 611 612
		}
	}
	*count += idx;
613
	return 0;
M
Mike Marciniszyn 已提交
614
free_req:
615 616 617 618 619 620 621 622 623 624
	/*
	 * If the submitted seqsubmitted == npkts, the completion routine
	 * controls the final state.  If sequbmitted < npkts, wait for any
	 * outstanding packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req, true);
625
		pq_update(pq);
626 627
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
M
Mike Marciniszyn 已提交
628 629 630 631
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
632
				      struct user_sdma_txreq *tx)
M
Mike Marciniszyn 已提交
633 634 635 636 637 638
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
639 640 641 642
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
M
Mike Marciniszyn 已提交
643 644 645 646 647 648
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
649 650 651 652 653
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
M
Mike Marciniszyn 已提交
654 655 656
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
657 658 659 660
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
M
Mike Marciniszyn 已提交
661 662 663 664 665 666 667 668 669
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
670 671
		/*
		 * Since the TID pairs map entire pages, make sure that we
M
Mike Marciniszyn 已提交
672
		 * are not going to try to send more data that we have
673 674
		 * remaining.
		 */
M
Mike Marciniszyn 已提交
675
		len = min(len, req->data_len - req->sent);
676
	} else {
M
Mike Marciniszyn 已提交
677
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
678
	}
679 680 681 682 683
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
M
Mike Marciniszyn 已提交
684 685 686
	return len;
}

687 688 689 690 691 692 693
/*
 * pad_len() - round a byte length up to the next multiple of sizeof(u32)
 * @len: length in bytes
 *
 * Return: @len unchanged if it is already a multiple of 4, otherwise the
 * next multiple of 4.
 */
static inline u32 pad_len(u32 len)
{
	const u32 rem = len & (sizeof(u32) - 1);

	return rem ? len + (sizeof(u32) - rem) : len;
}

/*
 * get_lrh_len() - byte length covered by the LRH length field
 * @hdr: packet header (only its size and the size of its PBC are used)
 * @len: data length in bytes
 *
 * Return: (size of complete header - size of PBC) + 4 bytes ICRC + @len.
 */
static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	const u32 hdr_without_pbc = sizeof(hdr) - sizeof(hdr.pbc);
	const u32 icrc_bytes = 4;

	return hdr_without_pbc + icrc_bytes + len;
}

/*
 * user_sdma_txadd_ahg() - set up a tx request with an AHG header copy
 * @req: request the packet belongs to
 * @tx: tx request to initialize
 * @datalen: payload length of this packet in bytes
 *
 * Copies the request's header template into @tx, fixes up the PBC length
 * field if it does not match the computed LRH length, validates the header,
 * initializes the txreq with SDMA_TXREQ_F_AHG_COPY and adds the header as
 * the first descriptor.  The txreq is cleaned if adding the header fails.
 *
 * Return: 0 on success or the error from the failing step.
 */
static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	u16 pbc0 = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrh_bytes = get_lrh_len(req->hdr, pad_len(datalen));
	int rc;

	/*
	 * The HW needs a cacheline-aligned header address, so work on a
	 * copy in the tx request.  The copy could be avoided if the hdr
	 * member of user_sdma_request were cacheline aligned as well.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));

	if (PBC2LRH(pbc0) != lrh_bytes) {
		pbc0 = (pbc0 & 0xf000) | LRH2PBC(lrh_bytes);
		tx->hdr.pbc[0] = cpu_to_le16(pbc0);
	}

	rc = check_header_template(req, &tx->hdr, lrh_bytes, datalen);
	if (rc)
		return rc;

	rc = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			     sizeof(tx->hdr) + datalen, req->ahg_idx,
			     0, NULL, 0, user_sdma_txreq_cb);
	if (rc)
		return rc;

	rc = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (!rc)
		return 0;

	/* Undo the txinit above when the header could not be added. */
	sdma_txclean(pq->dd, &tx->txreq);
	return rc;
}

/*
 * user_sdma_txadd() - add one page-bounded chunk of user data to a txreq
 * @req: request the packet belongs to
 * @tx: tx request the data is added to
 * @iovec: data vector currently being consumed
 * @datalen: total payload length of the packet being built
 * @queued_ptr: in/out, bytes already queued for this packet
 * @data_sent_ptr: in/out, bytes queued by this send pass
 * @iov_offset_ptr: in/out, bytes consumed from @iovec by this packet
 *
 * The chunk never crosses a page boundary and is limited to the request's
 * fragsize and to the bytes still missing from the packet.  The in/out
 * counters are only written back when the page was added successfully.
 *
 * Return: 0 on success or the error from sdma_txadd_page().
 */
static int user_sdma_txadd(struct user_sdma_request *req,
			   struct user_sdma_txreq *tx,
			   struct user_sdma_iovec *iovec, u32 datalen,
			   u32 *queued_ptr, u32 *data_sent_ptr,
			   u64 *iov_offset_ptr)
{
	int ret;
	unsigned int pageidx, len;
	unsigned long base, offset;
	u64 iov_offset = *iov_offset_ptr;
	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	base = (unsigned long)iovec->iov.iov_base;
	/* Offset of the current position within its page. */
	offset = offset_in_page(base + iovec->offset + iov_offset);
	/* Index of that page, counted from the page containing base. */
	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
		   PAGE_SHIFT);
	/* Stop at the page boundary or at fragsize, whichever comes first. */
	len = offset + req->info.fragsize > PAGE_SIZE ?
		PAGE_SIZE - offset : req->info.fragsize;
	/* Never queue more than the packet still needs. */
	len = min((datalen - queued), len);
	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
			      offset, len);
	if (ret) {
		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
		return ret;
	}
	iov_offset += len;
	queued += len;
	data_sent += len;
	/*
	 * NOTE(review): the reassignment of the local 'iovec' below is not
	 * visible to the caller; only req->iov_idx, the old vector's offset
	 * and the reset iov_offset are. Confirm the caller copes with that.
	 */
	if (unlikely(queued < datalen && pageidx == iovec->npages &&
		     req->iov_idx < req->data_iovs - 1)) {
		iovec->offset += iov_offset;
		iovec = &req->iovs[++req->iov_idx];
		iov_offset = 0;
	}

	*queued_ptr = queued;
	*data_sent_ptr = data_sent;
	*iov_offset_ptr = iov_offset;
	return ret;
}

/*
 * user_sdma_send_pkts() - build and submit packets for a request
 * @req: request to send packets for
 * @maxpkts: maximum number of packets to build in this call; 0 means all
 *           packets that remain in the request
 *
 * Builds up to @maxpkts tx requests, appends them to req->txps and hands
 * the list to sdma_send_txlist().  req->seqsubmitted is advanced by the
 * number of packets the engine reports as submitted, and the AHG entry is
 * freed once every packet of the request has been submitted.
 *
 * Return: the result of sdma_send_txlist(), 0 if there is nothing left to
 * build or send, -EINVAL if @req has no packet queue, -EFAULT if a
 * completion reported an error or the data vectors are inconsistent,
 * -ENOMEM if a tx request cannot be allocated, or the error from setting up
 * a tx request.
 */
static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count = 0;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			/* Current vector fully consumed: move to the next. */
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for the payload <= 8DWS.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less than
			 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is
			 * not reported. RHF.EccErr is set if the header
			 * is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				/* First packet carries the full header copy. */
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			ret = user_sdma_txadd(req, tx, iovec, datalen,
					      &queued, &data_sent, &iov_offset);
			if (ret)
				goto free_txreq;
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
D
Dean Luick 已提交
959
	struct evict_data evict_data;
960

D
Dean Luick 已提交
961 962 963 964
	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
965
}
966

967 968 969 970 971 972 973 974 975 976
static int pin_sdma_pages(struct user_sdma_request *req,
			  struct user_sdma_iovec *iovec,
			  struct sdma_mmu_node *node,
			  int npages)
{
	int pinned, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
977
	if (!pages)
978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
		return -ENOMEM;
	memcpy(pages, node->pages, node->npages * sizeof(*pages));

	npages -= node->npages;
retry:
	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
				atomic_read(&pq->n_locked), npages)) {
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}
	pinned = hfi1_acquire_user_pages(pq->mm,
					 ((unsigned long)iovec->iov.iov_base +
					 (node->npages * PAGE_SIZE)), npages, 0,
					 pages + node->npages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
		return -EFAULT;
	}
	kfree(node->pages);
	node->rb.len = iovec->iov.iov_len;
	node->pages = pages;
	atomic_add(pinned, &pq->n_locked);
	return pinned;
}

1008 1009 1010 1011 1012 1013 1014 1015
static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
}

M
Mike Marciniszyn 已提交
1016
static int pin_vector_pages(struct user_sdma_request *req,
I
Ira Weiny 已提交
1017 1018
			    struct user_sdma_iovec *iovec)
{
1019
	int ret = 0, pinned, npages;
1020 1021 1022
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;
1023
	struct iovec *iov;
1024 1025 1026 1027 1028 1029 1030 1031
	bool extracted;

	extracted =
		hfi1_mmu_rb_remove_unless_exact(pq->handler,
						(unsigned long)
						iovec->iov.iov_base,
						iovec->iov.iov_len, &rb_node);
	if (rb_node) {
1032
		node = container_of(rb_node, struct sdma_mmu_node, rb);
1033 1034 1035 1036 1037 1038 1039 1040
		if (!extracted) {
			atomic_inc(&node->refcount);
			iovec->pages = node->pages;
			iovec->npages = node->npages;
			iovec->node = node;
			return 0;
		}
	}
1041 1042 1043 1044 1045

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;
1046

1047
		node->rb.addr = (unsigned long)iovec->iov.iov_base;
1048
		node->pq = pq;
1049
		atomic_set(&node->refcount, 0);
M
Mike Marciniszyn 已提交
1050
	}
1051

1052 1053
	iov = &iovec->iov;
	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
1054
	if (node->npages < npages) {
1055
		pinned = pin_sdma_pages(req, iovec, node, npages);
1056 1057 1058 1059 1060 1061 1062 1063 1064
		if (pinned < 0) {
			ret = pinned;
			goto bail;
		}
		node->npages += pinned;
		npages = node->npages;
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
1065
	iovec->node = node;
1066

1067
	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
1068
	if (ret) {
1069
		iovec->node = NULL;
1070
		goto bail;
M
Mike Marciniszyn 已提交
1071
	}
1072
	return 0;
1073
bail:
1074
	unpin_sdma_pages(node);
1075
	kfree(node);
1076
	return ret;
M
Mike Marciniszyn 已提交
1077 1078
}

1079
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
1080
			       unsigned start, unsigned npages)
M
Mike Marciniszyn 已提交
1081
{
I
Ira Weiny 已提交
1082
	hfi1_release_user_pages(mm, pages + start, npages, false);
1083
	kfree(pages);
M
Mike Marciniszyn 已提交
1084 1085 1086 1087 1088 1089 1090 1091 1092
}

/*
 * check_header_template() - sanity-check the user supplied header.
 * @req: request the header belongs to
 * @hdr: header to validate
 * @lrhlen: LRH length computed for this packet
 * @datalen: payload length of this packet
 *
 * Return: 0 if the header passes all checks, -EINVAL otherwise.
 */
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 *    - transfer size is multiple of 64bytes
	 *    - packet length is multiple of 4 bytes
	 *    - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		/* Scale the header offset by the unit selected by KDETH.OM. */
		tidoff = KDETH_GET(kval, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 *     - offset is not larger than the TID size
		 *     - TIDCtrl values match between header and TID array
		 *     - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 *
 * @bthpsn: current BTH.PSN word, big-endian
 * @expct: non-zero for an expected transfer, zero otherwise
 * @frags: amount to advance the PSN by
 *
 * For expected transfers only the bits under HFI1_KDETH_BTH_SEQ_MASK are
 * advanced (wrapping within that mask); the remaining PSN bits are kept.
 * The PSN width is 31 bits when the EXTENDED_PSN capability is set and
 * 24 bits otherwise.
 *
 * Return: the new PSN in CPU byte order, masked to the PSN width.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

/*
 * set_txreq_header() - build the per-packet header for the non-AHG path.
 * @req: request being sent; its header template is copied and, for the
 *       third packet, updated with the steady-state lengths
 * @tx: tx request whose private header copy (tx->hdr) is filled in
 * @datalen: payload length of this packet
 *
 * The first packet of a request (req->seqnum == 0) uses the user supplied
 * header after validation by check_header_template().  Later packets get
 * BTH.PSN, the ACK request bit, the KDETH offset and, for expected
 * transfers, the KDETH TID fields rewritten.  For expected transfers this
 * may also advance req->tididx and reset req->tidoffset.
 *
 * Return: the result of sdma_txadd_kvaddr() on the finished header, or
 * -EINVAL if the header template is invalid or the TID list is exhausted.
 */
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs, all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
			KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

/*
 * set_txreq_header_ahg() - describe this packet's header changes for AHG.
 * @req: request being sent; req->hdr is the header template the updates
 *       are expressed against
 * @tx: tx request that is initialised with the AHG update list
 * @datalen: payload length of this packet
 *
 * Instead of building a full header, a list of AHG update entries is
 * collected with ahg_header_set() (PBC/LRH lengths when they differ from
 * the template, BTH.PSN and the ACK bit, the KDETH offset and, for
 * expected transfers, the KDETH TID fields).  The list is then handed to
 * sdma_txinit_ahg().  For expected transfers this may also advance
 * req->tididx and reset req->tidoffset.
 *
 * Return: the number of AHG entries used, a negative value propagated
 * from ahg_header_set(), or -EINVAL if the TID list is exhausted.
 */
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs, all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
				  PAGE_SIZE) >=
				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
				 KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
				ahg, idx, array_size, 7, 0, 16,
				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
				((req->tidoffset >> omfactor)
				& 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				   (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			/* SH is left clear; only the template's INTR bit is kept. */
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					     AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	/*
	 * NOTE(review): the result of sdma_txinit_ahg(), if it returns one,
	 * is not checked here - confirm against its declaration.
	 */
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

1387 1388 1389 1390 1391 1392 1393 1394 1395
/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
1396
 */
1397
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
M
Mike Marciniszyn 已提交
1398 1399 1400
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
1401
	struct user_sdma_request *req;
1402 1403
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
1404
	enum hfi1_sdma_comp_state state = COMPLETE;
M
Mike Marciniszyn 已提交
1405

1406
	if (!tx->req)
M
Mike Marciniszyn 已提交
1407 1408
		return;

1409
	req = tx->req;
1410 1411
	pq = req->pq;
	cq = req->cq;
M
Mike Marciniszyn 已提交
1412 1413

	if (status != SDMA_TXREQ_S_OK) {
1414 1415
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
1416
		WRITE_ONCE(req->has_error, 1);
1417
		state = ERROR;
1418 1419
	}

1420 1421
	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);
1422 1423 1424 1425 1426 1427 1428 1429

	/* sequence isn't complete?  We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req, false);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
1430 1431
}

1432
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1433
{
1434
	if (atomic_dec_and_test(&pq->n_reqs))
1435
		wake_up(&pq->wait);
M
Mike Marciniszyn 已提交
1436 1437
}

1438
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
M
Mike Marciniszyn 已提交
1439
{
1440 1441
	int i;

M
Mike Marciniszyn 已提交
1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}
1453 1454 1455 1456 1457 1458 1459

	for (i = 0; i < req->data_iovs; i++) {
		struct sdma_mmu_node *node = req->iovs[i].node;

		if (!node)
			continue;

1460 1461
		req->iovs[i].node = NULL;

1462 1463 1464 1465 1466
		if (unpin)
			hfi1_mmu_rb_remove(req->pq->handler,
					   &node->rb);
		else
			atomic_dec(&node->refcount);
M
Mike Marciniszyn 已提交
1467
	}
1468

M
Mike Marciniszyn 已提交
1469
	kfree(req->tids);
1470
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
M
Mike Marciniszyn 已提交
1471 1472
}

1473 1474 1475 1476
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
M
Mike Marciniszyn 已提交
1477 1478
{
	if (state == ERROR)
1479
		cq->comps[idx].errcode = -ret;
1480 1481
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
1482 1483
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
M
Mike Marciniszyn 已提交
1484
}
1485 1486 1487 1488 1489 1490 1491

/*
 * sdma_rb_filter() - decide whether an rb-tree node matches an address.
 * @node: rb-tree node being examined
 * @addr: address being looked up
 * @len: length of the range being looked up; not used, the match is on
 *       the start address only
 *
 * Return: true if the node starts at @addr, false otherwise.
 */
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	/* The comparison already yields a boolean; no cast is needed. */
	return node->addr == addr;
}

1492
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
1493 1494 1495 1496 1497 1498 1499 1500
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

D
Dean Luick 已提交
1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526
/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

1527
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
1528 1529 1530 1531
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

1532
	unpin_sdma_pages(node);
1533 1534 1535
	kfree(node);
}

1536
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
1537 1538 1539 1540 1541 1542 1543
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
M
Mike Marciniszyn 已提交
1544
}