/*
 * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * GK20A does not have dedicated video memory, and to accurately represent this
 * fact Nouveau will not create a RAM device for it. Therefore its instmem
 * implementation must be done directly on top of system memory, while providing
 * coherent read and write operations.
 *
 * Instmem can be allocated through two means:
 * 1) If an IOMMU mapping has been probed, the IOMMU API is used to make memory
 *    pages contiguous to the GPU. This is the preferred way.
 * 2) If no IOMMU mapping is probed, the DMA API is used to allocate physically
 *    contiguous memory.
 *
 * In both cases CPU read and writes are performed using PRAMIN (i.e. using the
 * GPU path) to ensure these operations are coherent for the GPU. This allows us
 * to use more "relaxed" allocation parameters when using the DMA API, since we
 * never need a kernel mapping.
 */
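/*
 * Two instobj variants implement the two paths above: gk20a_instobj_dma
 * backs an object with a single physically contiguous DMA buffer described
 * by an embedded nvkm_mm_node, while gk20a_instobj_iommu backs it with an
 * array of individually allocated pages that are mapped contiguously into
 * the GPU's IOMMU address space. In both cases the CPU only touches the
 * contents through the PRAMIN accessors further down.
 */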
#define gk20a_instmem(p) container_of((p), struct gk20a_instmem, base)
#include "priv.h"

#include <core/memory.h>
#include <core/mm.h>
#include <subdev/fb.h>

#ifdef __KERNEL__
#include <linux/dma-attrs.h>
#include <linux/iommu.h>
#include <nouveau_platform.h>
#endif

#define gk20a_instobj(p) container_of((p), struct gk20a_instobj, memory)

struct gk20a_instobj {
	struct nvkm_memory memory;
	struct gk20a_instmem *imem;
	struct nvkm_mem mem;
};

/*
 * Used for objects allocated using the DMA API
 */
struct gk20a_instobj_dma {
	struct gk20a_instobj base;

	void *cpuaddr;
	dma_addr_t handle;
	struct nvkm_mm_node r;
};

/*
 * Used for objects flattened using the IOMMU API
 */
struct gk20a_instobj_iommu {
	struct gk20a_instobj base;

	/* array of base.mem.size pages */
	struct page *pages[];
};

struct gk20a_instmem {
	struct nvkm_instmem base;
	unsigned long lock_flags;
	spinlock_t lock;
	u64 addr;

	/* Only used if IOMMU is present */
	struct mutex *mm_mutex;
	struct nvkm_mm *mm;
	struct iommu_domain *domain;
	unsigned long iommu_pgshift;

	/* Only used by DMA API */
	struct dma_attrs attrs;
};

static enum nvkm_memory_target
gk20a_instobj_target(struct nvkm_memory *memory)
{
	return NVKM_MEM_TARGET_HOST;
}

static u64
gk20a_instobj_addr(struct nvkm_memory *memory)
{
	return gk20a_instobj(memory)->mem.offset;
}

static u64
gk20a_instobj_size(struct nvkm_memory *memory)
{
	return (u64)gk20a_instobj(memory)->mem.size << 12;
}

static void __iomem *
gk20a_instobj_acquire(struct nvkm_memory *memory)
{
	struct gk20a_instmem *imem = gk20a_instobj(memory)->imem;
	unsigned long flags;
	spin_lock_irqsave(&imem->lock, flags);
	imem->lock_flags = flags;
	return NULL;
}

static void
gk20a_instobj_release(struct nvkm_memory *memory)
{
	struct gk20a_instmem *imem = gk20a_instobj(memory)->imem;
	spin_unlock_irqrestore(&imem->lock, imem->lock_flags);
}

/*
 * Use PRAMIN to read/write data and avoid coherency issues.
 * PRAMIN uses the GPU path and ensures data will always be coherent.
 *
 * A dynamic mapping based solution would be desirable in the future, but
 * the issue remains of how to maintain coherency efficiently. On ARM it is
 * not easy (if possible at all?) to create uncached temporary mappings.
 */
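/*
 * Concretely, the accessors below slide a 1 MiB PRAMIN window over the
 * object: the window base (in 64 KiB units, hence the ">> 16") is written
 * to register 0x001700 and the data is then accessed through the 0x700000
 * aperture. imem->addr caches the currently selected base so the window is
 * only moved when a different 1 MiB region is touched.
 */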

static u32
gk20a_instobj_rd32(struct nvkm_memory *memory, u64 offset)
{
	struct gk20a_instobj *node = gk20a_instobj(memory);
	struct gk20a_instmem *imem = node->imem;
	struct nvkm_device *device = imem->base.subdev.device;
	u64 base = (node->mem.offset + offset) & 0xffffff00000ULL;
	u64 addr = (node->mem.offset + offset) & 0x000000fffffULL;
	u32 data;

	if (unlikely(imem->addr != base)) {
		nvkm_wr32(device, 0x001700, base >> 16);
		imem->addr = base;
	}
	data = nvkm_rd32(device, 0x700000 + addr);
	return data;
}

static void
gk20a_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data)
{
	struct gk20a_instobj *node = gk20a_instobj(memory);
	struct gk20a_instmem *imem = node->imem;
	struct nvkm_device *device = imem->base.subdev.device;
	u64 base = (node->mem.offset + offset) & 0xffffff00000ULL;
	u64 addr = (node->mem.offset + offset) & 0x000000fffffULL;

	if (unlikely(imem->addr != base)) {
		nvkm_wr32(device, 0x001700, base >> 16);
		imem->addr = base;
	}
	nvkm_wr32(device, 0x700000 + addr, data);
}

static void
gk20a_instobj_map(struct nvkm_memory *memory, struct nvkm_vma *vma, u64 offset)
{
	struct gk20a_instobj *node = gk20a_instobj(memory);
	nvkm_vm_map_at(vma, offset, &node->mem);
}

static void
gk20a_instobj_dtor_dma(struct gk20a_instobj *_node)
{
	struct gk20a_instobj_dma *node = (void *)_node;
	struct gk20a_instmem *imem = _node->imem;
	struct device *dev = nv_device_base(imem->base.subdev.device);

	if (unlikely(!node->cpuaddr))
		return;

	dma_free_attrs(dev, _node->mem.size << PAGE_SHIFT, node->cpuaddr,
		       node->handle, &imem->attrs);
}

static void
gk20a_instobj_dtor_iommu(struct gk20a_instobj *_node)
{
	struct gk20a_instobj_iommu *node = (void *)_node;
	struct gk20a_instmem *imem = _node->imem;
	struct nvkm_mm_node *r;
	int i;

	if (unlikely(list_empty(&_node->mem.regions)))
		return;

	r = list_first_entry(&_node->mem.regions, struct nvkm_mm_node,
			     rl_entry);

	/* clear bit 34 to unmap pages */
	r->offset &= ~BIT(34 - imem->iommu_pgshift);

	/* Unmap pages from GPU address space and free them */
	for (i = 0; i < _node->mem.size; i++) {
		iommu_unmap(imem->domain,
			    (r->offset + i) << imem->iommu_pgshift, PAGE_SIZE);
		__free_page(node->pages[i]);
	}

	/* Release area from GPU address space */
	mutex_lock(imem->mm_mutex);
	nvkm_mm_free(imem->mm, &r);
	mutex_unlock(imem->mm_mutex);
}

static void *
gk20a_instobj_dtor(struct nvkm_memory *memory)
{
	struct gk20a_instobj *node = gk20a_instobj(memory);
	struct gk20a_instmem *imem = node->imem;

	if (imem->domain)
		gk20a_instobj_dtor_iommu(node);
	else
		gk20a_instobj_dtor_dma(node);

	return node;
}

static const struct nvkm_memory_func
gk20a_instobj_func = {
	.dtor = gk20a_instobj_dtor,
	.target = gk20a_instobj_target,
	.addr = gk20a_instobj_addr,
	.size = gk20a_instobj_size,
	.acquire = gk20a_instobj_acquire,
	.release = gk20a_instobj_release,
	.rd32 = gk20a_instobj_rd32,
	.wr32 = gk20a_instobj_wr32,
	.map = gk20a_instobj_map,
};

static int
gk20a_instobj_ctor_dma(struct gk20a_instmem *imem, u32 npages, u32 align,
		       struct gk20a_instobj **_node)
{
	struct gk20a_instobj_dma *node;
	struct nvkm_subdev *subdev = &imem->base.subdev;
	struct device *dev = subdev->device->dev;

	if (!(node = kzalloc(sizeof(*node), GFP_KERNEL)))
		return -ENOMEM;
	*_node = &node->base;

	node->cpuaddr = dma_alloc_attrs(dev, npages << PAGE_SHIFT,
					&node->handle, GFP_KERNEL,
					&imem->attrs);
	if (!node->cpuaddr) {
		nvkm_error(subdev, "cannot allocate DMA memory\n");
		return -ENOMEM;
	}

	/* alignment check */
	if (unlikely(node->handle & (align - 1)))
		nvkm_warn(subdev,
			  "memory not aligned as requested: %pad (0x%x)\n",
			  &node->handle, align);

	/* present memory so it can be mapped using small pages */
	node->r.type = 12;
	node->r.offset = node->handle >> 12;
	node->r.length = (npages << PAGE_SHIFT) >> 12;

	node->base.mem.offset = node->handle;

	INIT_LIST_HEAD(&node->base.mem.regions);
	list_add_tail(&node->r.rl_entry, &node->base.mem.regions);

	return 0;
}

static int
gk20a_instobj_ctor_iommu(struct gk20a_instmem *imem, u32 npages, u32 align,
			 struct gk20a_instobj **_node)
{
	struct gk20a_instobj_iommu *node;
	struct nvkm_subdev *subdev = &imem->base.subdev;
	struct nvkm_mm_node *r;
	int ret;
	int i;

	if (!(node = kzalloc(sizeof(*node) +
			     sizeof(node->pages[0]) * npages, GFP_KERNEL)))
		return -ENOMEM;
	*_node = &node->base;

	/* Allocate backing memory */
	for (i = 0; i < npages; i++) {
		struct page *p = alloc_page(GFP_KERNEL);

		if (p == NULL) {
			ret = -ENOMEM;
			goto free_pages;
		}
		node->pages[i] = p;
	}

	mutex_lock(imem->mm_mutex);
	/* Reserve area from GPU address space */
	ret = nvkm_mm_head(imem->mm, 0, 1, npages, npages,
			   align >> imem->iommu_pgshift, &r);
	mutex_unlock(imem->mm_mutex);
	if (ret) {
		nvkm_error(subdev, "virtual space is full!\n");
		goto free_pages;
	}

	/* Map into GPU address space */
	for (i = 0; i < npages; i++) {
		struct page *p = node->pages[i];
		u32 offset = (r->offset + i) << imem->iommu_pgshift;

		ret = iommu_map(imem->domain, offset, page_to_phys(p),
				PAGE_SIZE, IOMMU_READ | IOMMU_WRITE);
		if (ret < 0) {
			nvkm_error(subdev, "IOMMU mapping failure: %d\n", ret);

			while (i-- > 0) {
				offset -= PAGE_SIZE;
				iommu_unmap(imem->domain, offset, PAGE_SIZE);
			}
			goto release_area;
		}
	}

	/* Bit 34 tells that an address is to be resolved through the IOMMU */
	r->offset |= BIT(34 - imem->iommu_pgshift);

	node->base.mem.offset = ((u64)r->offset) << imem->iommu_pgshift;

	INIT_LIST_HEAD(&node->base.mem.regions);
	list_add_tail(&r->rl_entry, &node->base.mem.regions);

	return 0;

release_area:
	mutex_lock(imem->mm_mutex);
	nvkm_mm_free(imem->mm, &r);
	mutex_unlock(imem->mm_mutex);

free_pages:
	for (i = 0; i < npages && node->pages[i] != NULL; i++)
		__free_page(node->pages[i]);

	return ret;
}

static int
gk20a_instobj_new(struct nvkm_instmem *base, u32 size, u32 align, bool zero,
		  struct nvkm_memory **pmemory)
{
	struct gk20a_instmem *imem = gk20a_instmem(base);
	struct gk20a_instobj *node = NULL;
	struct nvkm_subdev *subdev = &imem->base.subdev;
	int ret;

	nvkm_debug(subdev, "%s (%s): size: %x align: %x\n", __func__,
		   imem->domain ? "IOMMU" : "DMA", size, align);

	/* Round size and align to page bounds */
	size = max(roundup(size, PAGE_SIZE), PAGE_SIZE);
	align = max(roundup(align, PAGE_SIZE), PAGE_SIZE);

	if (imem->domain)
		ret = gk20a_instobj_ctor_iommu(imem, size >> PAGE_SHIFT,
					       align, &node);
	else
		ret = gk20a_instobj_ctor_dma(imem, size >> PAGE_SHIFT,
					     align, &node);
	*pmemory = node ? &node->memory : NULL;
	if (ret)
		return ret;

	nvkm_memory_ctor(&gk20a_instobj_func, &node->memory);
	node->imem = imem;

	/* present memory so it can be mapped using small pages */
	node->mem.size = size >> 12;
	node->mem.memtype = 0;
	node->mem.page_shift = 12;

	nvkm_debug(subdev, "alloc size: 0x%x, align: 0x%x, gaddr: 0x%llx\n",
		   size, align, node->mem.offset);

	return 0;
}
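/*
 * For reference, a minimal usage sketch (assuming the generic
 * nvkm_memory_new()/nvkm_kmap()/nvkm_wo32() helpers of this tree, which
 * dispatch NVKM_MEM_TARGET_INST allocations and accesses to the hooks
 * above; sizes and values are arbitrary):
 *
 *	struct nvkm_memory *inst;
 *	int ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST,
 *				  0x1000, 0x1000, true, &inst);
 *	if (ret == 0) {
 *		nvkm_kmap(inst);
 *		nvkm_wo32(inst, 0x00, 0xcafe0001);
 *		nvkm_done(inst);
 *		nvkm_memory_del(&inst);
 *	}
 */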

static void
gk20a_instmem_fini(struct nvkm_instmem *base)
{
	gk20a_instmem(base)->addr = ~0ULL;
}

static const struct nvkm_instmem_func
gk20a_instmem = {
	.fini = gk20a_instmem_fini,
	.memory_new = gk20a_instobj_new,
	.persistent = true,
	.zero = false,
};

int
gk20a_instmem_new(struct nvkm_device *device, int index,
		 struct nvkm_instmem **pimem)
{
	struct gk20a_instmem *imem;

	if (!(imem = kzalloc(sizeof(*imem), GFP_KERNEL)))
		return -ENOMEM;
	nvkm_instmem_ctor(&gk20a_instmem, device, index, &imem->base);
	spin_lock_init(&imem->lock);
	*pimem = &imem->base;

	if (device->gpu->iommu.domain) {
		imem->domain = device->gpu->iommu.domain;
		imem->mm = device->gpu->iommu.mm;
		imem->iommu_pgshift = device->gpu->iommu.pgshift;
		imem->mm_mutex = &device->gpu->iommu.mutex;

		nvkm_info(&imem->base.subdev, "using IOMMU\n");
	} else {
		init_dma_attrs(&imem->attrs);
		/*
		 * We will access instmem through PRAMIN and thus do not need a
		 * consistent CPU pointer or kernel mapping
		 */
		dma_set_attr(DMA_ATTR_NON_CONSISTENT, &imem->attrs);
		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &imem->attrs);
		dma_set_attr(DMA_ATTR_WRITE_COMBINE, &imem->attrs);
		dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &imem->attrs);

		nvkm_info(&imem->base.subdev, "using DMA API\n");
	}

	return 0;
}