/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
	struct sg_page_iter sg_iter;
	struct page *page;

	if (umem->nmap > 0)
		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
				DMA_BIDIRECTIONAL);

	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
		page = sg_page_iter_page(&sg_iter);
		if (!PageDirty(page) && umem->writable && dirty)
			set_page_dirty_lock(page);
		put_page(page);
	}

	sg_free_table(&umem->sg_head);
}

/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
 *
 * sg: current scatterlist entry
 * page_list: array of npages struct page pointers
 * npages: number of pages in page_list
 * max_seg_sz: maximum segment size in bytes
 * nents: [out] number of entries in the scatterlist
 *
 * Return new end of scatterlist
 */
static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
						struct page **page_list,
						unsigned long npages,
						unsigned int max_seg_sz,
						int *nents)
{
	unsigned long first_pfn;
	unsigned long i = 0;
	bool update_cur_sg = false;
	bool first = !sg_page(sg);

	/* Check if new page_list is contiguous with end of previous page_list.
	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
	 */
	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
		       page_to_pfn(page_list[0])))
		update_cur_sg = true;

	while (i != npages) {
		unsigned long len;
		struct page *first_page = page_list[i];

		first_pfn = page_to_pfn(first_page);

		/* Compute the number of contiguous pages we have starting
		 * at i
		 */
		for (len = 0; i != npages &&
			      first_pfn + len == page_to_pfn(page_list[i]) &&
			      len < (max_seg_sz >> PAGE_SHIFT);
		     len++)
			i++;

		/* Squash N contiguous pages from page_list into current sge */
		if (update_cur_sg) {
			if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
				sg_set_page(sg, sg_page(sg),
					    sg->length + (len << PAGE_SHIFT),
					    0);
				update_cur_sg = false;
				continue;
			}
			update_cur_sg = false;
		}

		/* Squash N contiguous pages into next sge or first sge */
		if (!first)
			sg = sg_next(sg);

		(*nents)++;
		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
		first = false;
	}

	return sg;
}
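
/*
 * Worked example (illustrative only, assuming 4 KiB pages and a large
 * max_seg_sz): a page_list whose pages have pfns { 100, 101, 102, 200 } is
 * folded by ib_umem_add_sg_table() into two scatterlist entries, one of
 * length 12288 covering pfns 100-102 and one of length 4096 covering pfn
 * 200.  A subsequent call whose first page has pfn 201 extends that second
 * entry instead of starting a new one, which is what keeps the SGL short
 * for physically contiguous pinned ranges.
 */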

/**
 * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
 *
 * @umem: umem struct
 * @pgsz_bitmap: bitmap of HW supported page sizes
 * @virt: IOVA
 *
 * This helper is intended for HW that supports multiple page
 * sizes but can do only a single page size per MR.
 *
 * Returns 0 if the umem requires page sizes not supported by
 * the driver in order to be mapped. Drivers that always support
 * PAGE_SIZE or smaller will never see a 0 result.
 */
unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
				     unsigned long pgsz_bitmap,
				     unsigned long virt)
{
	struct scatterlist *sg;
	unsigned int best_pg_bit;
	unsigned long va, pgoff;
	dma_addr_t mask;
	int i;

	/* At minimum, drivers must support PAGE_SIZE or smaller */
	if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
		return 0;

	va = virt;
	/* max page size not to exceed MR length */
	mask = roundup_pow_of_two(umem->length);
	/* offset into first SGL */
	pgoff = umem->address & ~PAGE_MASK;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
		/* Walk SGL and reduce max page size if VA/PA bits differ
		 * for any address.
		 */
		mask |= (sg_dma_address(sg) + pgoff) ^ va;
		if (i && i != (umem->nmap - 1))
			/* restrict by length as well for interior SGEs */
			mask |= sg_dma_len(sg);
		va += sg_dma_len(sg) - pgoff;
		pgoff = 0;
	}
	best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap);

	return BIT_ULL(best_pg_bit);
}
EXPORT_SYMBOL(ib_umem_find_best_pgsz);
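
/*
 * Minimal usage sketch (not part of this file): a hypothetical driver that
 * supports 4K, 64K and 2M MR page sizes could pick one page size for the
 * whole MR as below.  "struct my_mr", my_driver_pick_page_size() and
 * MY_SUPPORTED_PGSZ are illustrative names only; the block is compiled out.
 */
#if 0
#define MY_SUPPORTED_PGSZ	(SZ_4K | SZ_64K | SZ_2M)

static int my_driver_pick_page_size(struct my_mr *mr, u64 iova)
{
	unsigned long pgsz;

	/* Largest supported page size that the SGL and the IOVA allow. */
	pgsz = ib_umem_find_best_pgsz(mr->umem, MY_SUPPORTED_PGSZ, iova);
	if (!pgsz)
		return -EINVAL;

	mr->page_shift = order_base_2(pgsz);
	return 0;
}
#endif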

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 *
 * If access flags indicate ODP memory, avoid pinning. Instead, store
 * the mm for future page fault handling in conjunction with MMU notifiers.
 *
 * @udata: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @dmasync: flush in-flight DMA when the memory region is written
 */
struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
			    size_t size, int access, int dmasync)
{
	struct ib_ucontext *context;
	struct ib_umem *umem;
	struct page **page_list;
	unsigned long lock_limit;
	unsigned long new_pinned;
	unsigned long cur_base;
	struct mm_struct *mm;
	unsigned long npages;
	int ret;
	unsigned long dma_attrs = 0;
	struct scatterlist *sg;
	unsigned int gup_flags = FOLL_WRITE;

	if (!udata)
		return ERR_PTR(-EIO);

	context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
			  ->context;
	if (!context)
		return ERR_PTR(-EIO);

	if (dmasync)
		dma_attrs |= DMA_ATTR_WRITE_BARRIER;

	/*
	 * If the combination of the addr and size requested for this memory
	 * region causes an integer overflow, return error.
	 */
	if (((addr + size) < addr) ||
	    PAGE_ALIGN(addr + size) < (addr + size))
		return ERR_PTR(-EINVAL);

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	if (access & IB_ACCESS_ON_DEMAND) {
		umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
		if (!umem)
			return ERR_PTR(-ENOMEM);
		umem->is_odp = 1;
	} else {
		umem = kzalloc(sizeof(*umem), GFP_KERNEL);
		if (!umem)
			return ERR_PTR(-ENOMEM);
	}
	umem->context    = context;
	umem->length     = size;
	umem->address    = addr;
	umem->page_shift = PAGE_SHIFT;
	umem->writable   = ib_access_writable(access);
	umem->owning_mm = mm = current->mm;
	mmgrab(mm);

	if (access & IB_ACCESS_ON_DEMAND) {
		if (WARN_ON_ONCE(!context->invalidate_range)) {
			ret = -EINVAL;
			goto umem_kfree;
		}

		ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
		if (ret)
			goto umem_kfree;
		return umem;
	}

	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list) {
		ret = -ENOMEM;
		goto umem_kfree;
	}

	npages = ib_umem_num_pages(umem);
	if (npages == 0 || npages > UINT_MAX) {
		ret = -EINVAL;
		goto out;
	}

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
		atomic64_sub(npages, &mm->pinned_vm);
		ret = -ENOMEM;
		goto out;
	}

	cur_base = addr & PAGE_MASK;

	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
	if (ret)
		goto vma;

	if (!umem->writable)
		gup_flags |= FOLL_FORCE;

	sg = umem->sg_head.sgl;

	while (npages) {
		down_read(&mm->mmap_sem);
		ret = get_user_pages_longterm(cur_base,
				     min_t(unsigned long, npages,
					   PAGE_SIZE / sizeof (struct page *)),
				     gup_flags, page_list, NULL);
		if (ret < 0) {
			up_read(&mm->mmap_sem);
			goto umem_release;
		}

		cur_base += ret * PAGE_SIZE;
		npages   -= ret;

		sg = ib_umem_add_sg_table(sg, page_list, ret,
			dma_get_max_seg_size(context->device->dma_device),
			&umem->sg_nents);

		up_read(&mm->mmap_sem);
	}

	sg_mark_end(sg);

	umem->nmap = ib_dma_map_sg_attrs(context->device,
				  umem->sg_head.sgl,
				  umem->sg_nents,
				  DMA_BIDIRECTIONAL,
				  dma_attrs);

	if (!umem->nmap) {
		ret = -ENOMEM;
		goto umem_release;
	}

	ret = 0;
	goto out;

umem_release:
	__ib_umem_release(context->device, umem, 0);
vma:
	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
	free_page((unsigned long) page_list);
umem_kfree:
	if (ret) {
		mmdrop(umem->owning_mm);
		kfree(umem);
	}
	return ret ? ERR_PTR(ret) : umem;
}
EXPORT_SYMBOL(ib_umem_get);
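
/*
 * Minimal usage sketch (not part of this file): the typical caller is a
 * driver's reg_user_mr() verb, which pins the range once and keeps the umem
 * around until the MR is destroyed.  "struct my_mr" and the my_driver_*
 * names are illustrative only; the block is compiled out.
 */
#if 0
static int my_driver_pin_user_mr(struct my_mr *mr, struct ib_udata *udata,
				 u64 start, u64 length, int access_flags)
{
	mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
	if (IS_ERR(mr->umem))
		return PTR_ERR(mr->umem);

	mr->npages = ib_umem_page_count(mr->umem);
	/* ... program the HW translation entries from mr->umem->sg_head ... */
	return 0;
}

static void my_driver_unpin_user_mr(struct my_mr *mr)
{
	/* Unmaps the DMA mapping, drops the page pins and frees the umem. */
	ib_umem_release(mr->umem);
}
#endif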

static void __ib_umem_release_tail(struct ib_umem *umem)
{
	mmdrop(umem->owning_mm);
	if (umem->is_odp)
		kfree(to_ib_umem_odp(umem));
	else
		kfree(umem);
}

/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
	if (umem->is_odp) {
		ib_umem_odp_release(to_ib_umem_odp(umem));
		__ib_umem_release_tail(umem);
		return;
	}

	__ib_umem_release(umem->context->device, umem, 1);

	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
	__ib_umem_release_tail(umem);
}
EXPORT_SYMBOL(ib_umem_release);

int ib_umem_page_count(struct ib_umem *umem)
{
	int i;
	int n;
	struct scatterlist *sg;

	if (umem->is_odp)
		return ib_umem_num_pages(umem);

	n = 0;
	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
		n += sg_dma_len(sg) >> umem->page_shift;

	return n;
}
EXPORT_SYMBOL(ib_umem_page_count);

/*
 * Copy from the given ib_umem's pages to the given buffer.
 *
 * umem - the umem to copy from
 * offset - offset to start copying from
 * dst - destination buffer
 * length - buffer length
 *
 * Returns 0 on success, or an error code.
 */
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
		      size_t length)
{
	size_t end = offset + length;
	int ret;

	if (offset > umem->length || length > umem->length - offset) {
		pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
		       offset, umem->length, end);
		return -EINVAL;
	}

	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
				 offset + ib_umem_offset(umem));

	if (ret < 0)
		return ret;
	else if (ret != length)
		return -EINVAL;
	else
		return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);
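
/*
 * Minimal usage sketch (not part of this file): reading the first bytes of a
 * registered region into a kernel buffer, e.g. to inspect a header that the
 * application placed at the start of the MR.  my_driver_read_mr_header() is
 * an illustrative name only; the block is compiled out.
 */
#if 0
static int my_driver_read_mr_header(struct ib_umem *umem, void *hdr,
				    size_t hdr_len)
{
	/* Returns -EINVAL if the requested window does not fit in the umem. */
	return ib_umem_copy_from(hdr, umem, 0, hdr_len);
}
#endif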