/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

/* Undo what ib_umem_get() did: unmap the DMA addresses and unpin the pages. */
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
	struct scatterlist *sg;
	struct page *page;
	int i;

	if (umem->nmap > 0)
		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
				umem->nmap,
				DMA_BIDIRECTIONAL);

	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
		page = sg_page(sg);
		if (umem->writable && dirty)
			set_page_dirty_lock(page);
		put_page(page);
	}

	sg_free_table(&umem->sg_head);
}

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 *
 * If the access flags indicate ODP memory, avoid pinning.  Instead, store
 * the mm for future page fault handling in conjunction with MMU notifiers.
 *
 * @context: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @dmasync: flush in-flight DMA when the memory region is written
 */
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
			    size_t size, int access, int dmasync)
{
	struct ib_umem *umem;
	struct page **page_list;
	struct vm_area_struct **vma_list;
	unsigned long locked;
	unsigned long lock_limit;
	unsigned long cur_base;
	unsigned long npages;
	int ret;
	int i;
	unsigned long dma_attrs = 0;
	struct scatterlist *sg, *sg_list_start;
	int need_release = 0;

	if (dmasync)
		dma_attrs |= DMA_ATTR_WRITE_BARRIER;

	if (!size)
		return ERR_PTR(-EINVAL);

	/*
	 * If the combination of the addr and size requested for this memory
	 * region causes an integer overflow, return error.
	 */
	if (((addr + size) < addr) ||
	    PAGE_ALIGN(addr + size) < (addr + size))
		return ERR_PTR(-EINVAL);

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	umem = kzalloc(sizeof *umem, GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	umem->context   = context;
	umem->length    = size;
	umem->address   = addr;
	umem->page_size = PAGE_SIZE;
	umem->pid       = get_task_pid(current, PIDTYPE_PID);
	/*
	 * We ask for writable memory if any of the following
	 * access flags are set.  "Local write" and "remote write"
	 * obviously require write access.  "Remote atomic" can do
	 * things like fetch and add, which will modify memory, and
	 * "MW bind" can change permissions by binding a window.
	 */
	umem->writable  = !!(access &
		(IB_ACCESS_LOCAL_WRITE   | IB_ACCESS_REMOTE_WRITE |
		 IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));

	if (access & IB_ACCESS_ON_DEMAND) {
		ret = ib_umem_odp_get(context, umem);
		if (ret) {
			kfree(umem);
			return ERR_PTR(ret);
		}
		return umem;
	}

	umem->odp_data = NULL;

	/* We assume the memory is from hugetlb until proved otherwise */
	umem->hugetlb   = 1;

	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list) {
		kfree(umem);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * if we can't alloc the vma_list, it's not so bad;
	 * just assume the memory is not hugetlb memory
	 */
	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
	if (!vma_list)
		umem->hugetlb = 0;

	npages = ib_umem_num_pages(umem);

	down_write(&current->mm->mmap_sem);

	/* Charge the pinned pages against the RLIMIT_MEMLOCK limit. */
	locked     = npages + current->mm->pinned_vm;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
		ret = -ENOMEM;
		goto out;
	}

	cur_base = addr & PAGE_MASK;

	if (npages == 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
	if (ret)
		goto out;

	need_release = 1;
	sg_list_start = umem->sg_head.sgl;

	while (npages) {
		ret = get_user_pages(cur_base,
				     min_t(unsigned long, npages,
					   PAGE_SIZE / sizeof (struct page *)),
				     1, !umem->writable, page_list, vma_list);

		if (ret < 0)
			goto out;

		umem->npages += ret;
		cur_base += ret * PAGE_SIZE;
		npages   -= ret;

		for_each_sg(sg_list_start, sg, ret, i) {
			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
				umem->hugetlb = 0;

			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
		}

		/* start the next iteration at the first unused sg entry */
		sg_list_start = sg;
	}

	umem->nmap = ib_dma_map_sg_attrs(context->device,
				  umem->sg_head.sgl,
				  umem->npages,
				  DMA_BIDIRECTIONAL,
				  dma_attrs);

	if (umem->nmap <= 0) {
		ret = -ENOMEM;
		goto out;
	}

	ret = 0;

out:
	if (ret < 0) {
		if (need_release)
			__ib_umem_release(context->device, umem, 0);
		put_pid(umem->pid);
		kfree(umem);
	} else
		current->mm->pinned_vm = locked;

	up_write(&current->mm->mmap_sem);
	if (vma_list)
		free_page((unsigned long) vma_list);
	free_page((unsigned long) page_list);

	return ret < 0 ? ERR_PTR(ret) : umem;
}
EXPORT_SYMBOL(ib_umem_get);
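
/*
 * Illustrative sketch of a typical caller: a driver's reg_user_mr path
 * pins the application buffer with ib_umem_get() and keeps the umem
 * around until the region is deregistered.  The helper name below is
 * hypothetical and not part of this file's API.
 */
static inline struct ib_umem *example_pin_user_region(struct ib_ucontext *context,
						      u64 start, u64 length,
						      int access_flags)
{
	struct ib_umem *umem;

	/* Pin and DMA map the user buffer; dmasync is not requested. */
	umem = ib_umem_get(context, start, length, access_flags, 0);
	if (IS_ERR(umem))
		return umem;

	pr_debug("pinned %d pages, %d DMA-mapped SG entries\n",
		 umem->npages, umem->nmap);

	/*
	 * The caller stores the umem in its MR structure and calls
	 * ib_umem_release(umem) when the region is deregistered.
	 */
	return umem;
}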

/*
 * Deferred pinned_vm accounting, run from the system workqueue when
 * ib_umem_release() cannot take the mm's mmap_sem directly.
 */
static void ib_umem_account(struct work_struct *work)
{
	struct ib_umem *umem = container_of(work, struct ib_umem, work);

	down_write(&umem->mm->mmap_sem);
	umem->mm->pinned_vm -= umem->diff;
	up_write(&umem->mm->mmap_sem);
	mmput(umem->mm);
	kfree(umem);
}

/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
	struct ib_ucontext *context = umem->context;
	struct mm_struct *mm;
	struct task_struct *task;
	unsigned long diff;

	if (umem->odp_data) {
		ib_umem_odp_release(umem);
		return;
	}

	__ib_umem_release(umem->context->device, umem, 1);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	diff = ib_umem_num_pages(umem);

	/*
	 * We may be called with the mm's mmap_sem already held.  This
	 * can happen when a userspace munmap() is the call that drops
	 * the last reference to our file and calls our release
	 * method.  If there are memory regions to destroy, we'll end
	 * up here and not be able to take the mmap_sem.  In that case
	 * we defer the pinned_vm accounting to the system workqueue.
	 */
	if (context->closing) {
		if (!down_write_trylock(&mm->mmap_sem)) {
			INIT_WORK(&umem->work, ib_umem_account);
			umem->mm   = mm;
			umem->diff = diff;

			queue_work(ib_wq, &umem->work);
			return;
		}
	} else
		down_write(&mm->mmap_sem);

	mm->pinned_vm -= diff;
	up_write(&mm->mmap_sem);
	mmput(mm);
out:
	kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);

int ib_umem_page_count(struct ib_umem *umem)
{
	int shift;
	int i;
	int n;
	struct scatterlist *sg;

	if (umem->odp_data)
		return ib_umem_num_pages(umem);

	shift = ilog2(umem->page_size);

	n = 0;
	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
		n += sg_dma_len(sg) >> shift;

	return n;
}
EXPORT_SYMBOL(ib_umem_page_count);
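
/*
 * Illustrative sketch (hypothetical helper, not part of the core API):
 * a driver that needs a flat list of device page addresses would size
 * the array with ib_umem_page_count() and fill it by walking the nmap
 * DMA-mapped scatterlist entries in umem->page_size steps.
 */
static inline u64 *example_build_page_list(struct ib_umem *umem, int *num_pages)
{
	struct scatterlist *sg;
	int shift = ilog2(umem->page_size);
	unsigned int j;
	u64 *pages;
	int n, i, k = 0;

	n = ib_umem_page_count(umem);
	pages = kmalloc_array(n, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	/* Each mapped entry may cover several umem->page_size sized pages. */
	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
		for (j = 0; j < (sg_dma_len(sg) >> shift); j++)
			pages[k++] = sg_dma_address(sg) + ((u64)j << shift);

	*num_pages = n;
	return pages;
}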

/**
 * ib_umem_copy_from - copy data from an ib_umem's pages to a kernel buffer
 * @dst: destination buffer
 * @umem: umem to copy from
 * @offset: offset within the umem to start copying from
 * @length: length of the data to copy
 *
 * Returns 0 on success, or an error code.
 */
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
		      size_t length)
{
	size_t end = offset + length;
	int ret;

	if (offset > umem->length || length > umem->length - offset) {
		pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
		       offset, umem->length, end);
		return -EINVAL;
	}

	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
				 offset + ib_umem_offset(umem));

	if (ret < 0)
		return ret;
	else if (ret != length)
		return -EINVAL;
	else
		return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);
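
/*
 * Illustrative use of ib_umem_copy_from() (the helper name and header
 * length are hypothetical): copy the first bytes of a registered region
 * into a kernel buffer, e.g. so a driver can inspect a descriptor the
 * application placed at the start of the memory region.
 */
static inline int example_peek_region_header(struct ib_umem *umem,
					     void *hdr, size_t hdr_len)
{
	/* Offset 0; bounds are validated by ib_umem_copy_from() itself. */
	return ib_umem_copy_from(hdr, umem, 0, hdr_len);
}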