/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/hugetlb.h>
#include <linux/dma-attrs.h>
#include <linux/slab.h>

#include "uverbs.h"


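/*
 * Unmap the umem's scatterlist (if it was DMA mapped) and unpin its pages,
 * marking them dirty first when the region is writable and the dirty
 * argument is set.
 */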
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
	struct scatterlist *sg;
	struct page *page;
	int i;

	if (umem->nmap > 0)
		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
				umem->nmap,
				DMA_BIDIRECTIONAL);

	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {

		page = sg_page(sg);
		if (umem->writable && dirty)
			set_page_dirty_lock(page);
		put_page(page);
	}

	sg_free_table(&umem->sg_head);
	return;

}

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 * @context: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @dmasync: flush in-flight DMA when the memory region is written
 */
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
			    size_t size, int access, int dmasync)
{
	struct ib_umem *umem;
	struct page **page_list;
	struct vm_area_struct **vma_list;
	unsigned long locked;
	unsigned long lock_limit;
	unsigned long cur_base;
	unsigned long npages;
	int ret;
	int i;
	DEFINE_DMA_ATTRS(attrs);
	struct scatterlist *sg, *sg_list_start;
	int need_release = 0;

	if (dmasync)
		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	umem = kzalloc(sizeof *umem, GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	umem->context   = context;
	umem->length    = size;
	umem->address   = addr;
	umem->page_size = PAGE_SIZE;
	umem->pid       = get_task_pid(current, PIDTYPE_PID);
	/*
	 * We ask for writable memory if any of the following
	 * access flags are set.  "Local write" and "remote write"
	 * obviously require write access.  "Remote atomic" can do
	 * things like fetch and add, which will modify memory, and
	 * "MW bind" can change permissions by binding a window.
	 */
	umem->writable  = !!(access &
		(IB_ACCESS_LOCAL_WRITE   | IB_ACCESS_REMOTE_WRITE |
		 IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));

	/* We assume the memory is from hugetlb until proved otherwise */
	umem->hugetlb   = 1;

	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list) {
		kfree(umem);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * if we can't alloc the vma_list, it's not so bad;
	 * just assume the memory is not hugetlb memory
	 */
	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
	if (!vma_list)
		umem->hugetlb = 0;

	npages = ib_umem_num_pages(umem);

	down_write(&current->mm->mmap_sem);

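	/*
	 * Charge the new pages against RLIMIT_MEMLOCK; only a task with
	 * CAP_IPC_LOCK may pin beyond that limit.
	 */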
	locked     = npages + current->mm->pinned_vm;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
		ret = -ENOMEM;
		goto out;
	}

	cur_base = addr & PAGE_MASK;

	if (npages == 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
	if (ret)
		goto out;

	need_release = 1;
	sg_list_start = umem->sg_head.sgl;

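	/*
	 * Pin the user pages in chunks: each get_user_pages() call takes at
	 * most one page worth of struct page pointers, and each chunk is
	 * added to the scatterlist.
	 */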
	while (npages) {
		ret = get_user_pages(current, current->mm, cur_base,
				     min_t(unsigned long, npages,
					   PAGE_SIZE / sizeof (struct page *)),
				     1, !umem->writable, page_list, vma_list);

		if (ret < 0)
			goto out;

		umem->npages += ret;
		cur_base += ret * PAGE_SIZE;
		npages   -= ret;

		for_each_sg(sg_list_start, sg, ret, i) {
			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
				umem->hugetlb = 0;

			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
		}

		/* preparing for next loop */
		sg_list_start = sg;
	}

	umem->nmap = ib_dma_map_sg_attrs(context->device,
				  umem->sg_head.sgl,
				  umem->npages,
				  DMA_BIDIRECTIONAL,
				  &attrs);

	if (umem->nmap <= 0) {
		ret = -ENOMEM;
		goto out;
	}

	ret = 0;

out:
	if (ret < 0) {
		if (need_release)
			__ib_umem_release(context->device, umem, 0);
		put_pid(umem->pid);
		kfree(umem);
	} else
		current->mm->pinned_vm = locked;

	up_write(&current->mm->mmap_sem);
	if (vma_list)
		free_page((unsigned long) vma_list);
	free_page((unsigned long) page_list);

	return ret < 0 ? ERR_PTR(ret) : umem;
}
EXPORT_SYMBOL(ib_umem_get);
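
/*
 * Example usage sketch (simplified, hypothetical driver code): a
 * reg_user_mr handler typically pins and maps a user region like this:
 *
 *	umem = ib_umem_get(pd->uobject->context, start, length,
 *			   access_flags, 0);
 *	if (IS_ERR(umem))
 *		return ERR_CAST(umem);
 *
 *	npages = ib_umem_page_count(umem);
 *	... program the HCA translation table from umem->sg_head.sgl ...
 *
 * and releases it again from its dereg_mr path with ib_umem_release().
 */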

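/*
 * Deferred pinned_vm accounting: runs from the IB workqueue when
 * ib_umem_release() cannot take the mm's mmap_sem directly.
 */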
static void ib_umem_account(struct work_struct *work)
{
	struct ib_umem *umem = container_of(work, struct ib_umem, work);

	down_write(&umem->mm->mmap_sem);
	umem->mm->pinned_vm -= umem->diff;
	up_write(&umem->mm->mmap_sem);
	mmput(umem->mm);
	kfree(umem);
}

/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
	struct ib_ucontext *context = umem->context;
	struct mm_struct *mm;
	struct task_struct *task;
	unsigned long diff;

	__ib_umem_release(umem->context->device, umem, 1);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	diff = ib_umem_num_pages(umem);

	/*
	 * We may be called with the mm's mmap_sem already held.  This
	 * can happen when a userspace munmap() is the call that drops
	 * the last reference to our file and calls our release
	 * method.  If there are memory regions to destroy, we'll end
	 * up here and not be able to take the mmap_sem.  In that case
	 * we defer the vm_locked accounting to the system workqueue.
	 */
	if (context->closing) {
		if (!down_write_trylock(&mm->mmap_sem)) {
			INIT_WORK(&umem->work, ib_umem_account);
			umem->mm   = mm;
			umem->diff = diff;

			queue_work(ib_wq, &umem->work);
			return;
		}
	} else
		down_write(&mm->mmap_sem);

	mm->pinned_vm -= diff;
	up_write(&mm->mmap_sem);
	mmput(mm);
out:
	kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);

int ib_umem_page_count(struct ib_umem *umem)
{
	int shift;
	int i;
	int n;
	struct scatterlist *sg;

	shift = ilog2(umem->page_size);

	n = 0;
	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
		n += sg_dma_len(sg) >> shift;

	return n;
}
EXPORT_SYMBOL(ib_umem_page_count);

/*
 * Copy from the given ib_umem's pages to the given buffer.
 *
 * umem - the umem to copy from
 * offset - offset to start copying from
 * dst - destination buffer
 * length - buffer length
 *
 * Returns 0 on success, or an error code.
 */
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
		      size_t length)
{
	size_t end = offset + length;
	int ret;

	if (offset > umem->length || length > umem->length - offset) {
		pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
		       offset, umem->length, end);
		return -EINVAL;
	}

	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
				 offset + ib_umem_offset(umem));

	if (ret < 0)
		return ret;
	else if (ret != length)
		return -EINVAL;
	else
		return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);
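
/*
 * Example usage sketch (hypothetical caller): copying the first bytes of a
 * registered region into a kernel buffer:
 *
 *	char buf[64];
 *	int err = ib_umem_copy_from(buf, umem, 0, sizeof(buf));
 *	if (err)
 *		return err;
 */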