/*
 * Copyright © 2008-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "i915_drv.h"
#include "i915_scatterlist.h"
#include "i915_pvinfo.h"
#include "i915_vgpu.h"

/**
 * DOC: fence register handling
 *
 * Important to avoid confusion: "fences" in the i915 driver are not execution
 * fences used to track command completion but hardware detiler objects which
 * wrap a given range of the global GTT. Each platform has only a fairly limited
 * set of these objects.
 *
 * Fences are used to detile GTT memory mappings. They're also connected to the
 * hardware frontbuffer render tracking and hence interact with frontbuffer
 * compression. Furthermore on older platforms fences are required for tiled
 * objects used by the display engine. They can also be used by the render
 * engine - they're required for blitter commands and are optional for render
 * commands. But on gen4+ both display (with the exception of fbc) and rendering
 * have their own tiling state bits and don't need fences.
 *
 * Also note that fences only support X and Y tiling and hence can't be used for
 * the fancier new tiling formats like W, Ys and Yf.
 *
 * Finally note that because fences are such a restricted resource they're
 * dynamically associated with objects. Furthermore fence state is committed to
 * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must
 * explicitly call i915_gem_object_get_fence() to synchronize fencing status
 * for cpu access. Also note that some code wants an unfenced view; for those
 * cases the fence can be removed forcefully with i915_gem_object_put_fence().
 *
 * Internally these functions will synchronize with userspace access by removing
 * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
 */
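
/*
 * A minimal usage sketch (illustrative only, not lifted from any real caller)
 * of how a fence is typically acquired and released around fenced GTT access,
 * assuming the vma is already pinned in the mappable GGTT and the caller holds
 * a runtime PM wakeref:
 *
 *	err = i915_vma_pin_fence(vma);
 *	if (err)
 *		return err;
 *
 *	if (vma->fence) {
 *		... access the object through the fenced (detiling) GTT range ...
 *	}
 *
 *	i915_vma_unpin_fence(vma);
 *
 * i915_vma_unpin_fence() only drops the pin count; the register itself stays
 * associated with the vma until it is lazily reused for another object or
 * force-removed with i915_vma_revoke_fence().
 */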

#define pipelined 0

static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence)
{
	return fence->ggtt->vm.i915;
}

static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence)
{
	return fence->ggtt->vm.gt->uncore;
}

static void i965_write_fence_reg(struct i915_fence_reg *fence)
{
	i915_reg_t fence_reg_lo, fence_reg_hi;
	int fence_pitch_shift;
	u64 val;

	if (INTEL_GEN(fence_to_i915(fence)) >= 6) {
		fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
		fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
		fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;

	} else {
		fence_reg_lo = FENCE_REG_965_LO(fence->id);
		fence_reg_hi = FENCE_REG_965_HI(fence->id);
		fence_pitch_shift = I965_FENCE_PITCH_SHIFT;
	}

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;

		GEM_BUG_ON(!IS_ALIGNED(stride, 128));

		val = fence->start + fence->size - I965_FENCE_PAGE;
		val <<= 32;
		val |= fence->start;
		val |= (u64)((stride / 128) - 1) << fence_pitch_shift;
		if (fence->tiling == I915_TILING_Y)
			val |= BIT(I965_FENCE_TILING_Y_SHIFT);
		val |= I965_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);

		/*
		 * To w/a incoherency with non-atomic 64-bit register updates,
		 * we split the 64-bit update into two 32-bit writes. In order
		 * for a partial fence not to be evaluated between writes, we
		 * precede the update with write to turn off the fence register,
		 * and only enable the fence as the last step.
		 *
		 * For extra levels of paranoia, we make sure each step lands
		 * before applying the next step.
		 */
		intel_uncore_write_fw(uncore, fence_reg_lo, 0);
		intel_uncore_posting_read_fw(uncore, fence_reg_lo);

		intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val));
		intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val));
		intel_uncore_posting_read_fw(uncore, fence_reg_lo);
	}
}

static void i915_write_fence_reg(struct i915_fence_reg *fence)
{
	u32 val;

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;
		unsigned int tiling = fence->tiling;
		bool is_y_tiled = tiling == I915_TILING_Y;

		if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence)))
			stride /= 128;
		else
			stride /= 512;
		GEM_BUG_ON(!is_power_of_2(stride));

		val = fence->start;
		if (is_y_tiled)
			val |= BIT(I830_FENCE_TILING_Y_SHIFT);
		val |= I915_FENCE_SIZE_BITS(fence->size);
		val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT;

		val |= I830_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);
		i915_reg_t reg = FENCE_REG(fence->id);

		intel_uncore_write_fw(uncore, reg, val);
		intel_uncore_posting_read_fw(uncore, reg);
	}
}

static void i830_write_fence_reg(struct i915_fence_reg *fence)
{
	u32 val;

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;

		val = fence->start;
		if (fence->tiling == I915_TILING_Y)
			val |= BIT(I830_FENCE_TILING_Y_SHIFT);
		val |= I830_FENCE_SIZE_BITS(fence->size);
		val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT;
		val |= I830_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);
		i915_reg_t reg = FENCE_REG(fence->id);

		intel_uncore_write_fw(uncore, reg, val);
		intel_uncore_posting_read_fw(uncore, reg);
	}
}

static void fence_write(struct i915_fence_reg *fence)
{
	struct drm_i915_private *i915 = fence_to_i915(fence);

	/*
	 * Previous access through the fence register is marshalled by
	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
	 * and explicitly managed for internal users.
	 */

	if (IS_GEN(i915, 2))
		i830_write_fence_reg(fence);
	else if (IS_GEN(i915, 3))
		i915_write_fence_reg(fence);
	else
		i965_write_fence_reg(fence);

	/*
	 * Access through the fenced region afterwards is
	 * ordered by the posting reads whilst writing the registers.
	 */
}

static bool gpu_uses_fence_registers(struct i915_fence_reg *fence)
{
	return INTEL_GEN(fence_to_i915(fence)) < 4;
}

static int fence_update(struct i915_fence_reg *fence,
			struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = fence->ggtt;
	struct intel_uncore *uncore = fence_to_uncore(fence);
	intel_wakeref_t wakeref;
	struct i915_vma *old;
	int ret;

	fence->tiling = 0;
	if (vma) {
		GEM_BUG_ON(!i915_gem_object_get_stride(vma->obj) ||
			   !i915_gem_object_get_tiling(vma->obj));

		if (!i915_vma_is_map_and_fenceable(vma))
			return -EINVAL;

		if (gpu_uses_fence_registers(fence)) {
			/* implicit 'unfenced' GPU blits */
			ret = i915_vma_sync(vma);
			if (ret)
				return ret;
		}

		fence->start = vma->node.start;
		fence->size = vma->fence_size;
		fence->stride = i915_gem_object_get_stride(vma->obj);
		fence->tiling = i915_gem_object_get_tiling(vma->obj);
	}
	WRITE_ONCE(fence->dirty, false);

	old = xchg(&fence->vma, NULL);
	if (old) {
		/* XXX Ideally we would move the waiting to outside the mutex */
		ret = i915_active_wait(&fence->active);
		if (ret) {
			fence->vma = old;
			return ret;
		}

		i915_vma_flush_writes(old);

		/*
		 * Ensure that all userspace CPU access is completed before
		 * stealing the fence.
		 */
		if (old != vma) {
			GEM_BUG_ON(old->fence != fence);
			i915_vma_revoke_mmap(old);
			old->fence = NULL;
		}

		list_move(&fence->link, &ggtt->fence_list);
	}

	/*
	 * We only need to update the register itself if the device is awake.
	 * If the device is currently powered down, we will defer the write
	 * to the runtime resume, see intel_ggtt_restore_fences().
	 *
	 * This only works for removing the fence register, on acquisition
	 * the caller must hold the rpm wakeref. The fence register must
	 * be cleared before we can use any other fences to ensure that
	 * the new fences do not overlap the elided clears, confusing HW.
	 */
	wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm);
	if (!wakeref) {
		GEM_BUG_ON(vma);
		return 0;
	}

	WRITE_ONCE(fence->vma, vma);
	fence_write(fence);

	if (vma) {
		vma->fence = fence;
		list_move_tail(&fence->link, &ggtt->fence_list);
	}

	intel_runtime_pm_put(uncore->rpm, wakeref);
	return 0;
}

/**
 * i915_vma_revoke_fence - force-remove fence for a VMA
 * @vma: vma to map linearly (not through a fence reg)
 *
 * This function force-removes any fence from the given object, which is useful
 * if the kernel wants to do untiled GTT access.
 */
void i915_vma_revoke_fence(struct i915_vma *vma)
{
	struct i915_fence_reg *fence = vma->fence;
	intel_wakeref_t wakeref;

	lockdep_assert_held(&vma->vm->mutex);
	if (!fence)
		return;

	GEM_BUG_ON(fence->vma != vma);
	GEM_BUG_ON(!i915_active_is_idle(&fence->active));
	GEM_BUG_ON(atomic_read(&fence->pin_count));

	fence->tiling = 0;
	WRITE_ONCE(fence->vma, NULL);
	vma->fence = NULL;

	/*
	 * Skip the write to HW if and only if the device is currently
	 * suspended.
	 *
	 * If the driver does not currently hold a wakeref (if_in_use == 0),
	 * the device may currently be runtime suspended, or it may be woken
	 * up before the suspend takes place. If the device is not suspended
	 * (powered down) and we skip clearing the fence register, the HW is
	 * left in an undefined state where we may end up with multiple
	 * registers overlapping.
	 */
	with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref)
		fence_write(fence);
}

static bool fence_is_active(const struct i915_fence_reg *fence)
{
	return fence->vma && i915_vma_is_active(fence->vma);
}

static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
{
	struct i915_fence_reg *active = NULL;
	struct i915_fence_reg *fence, *fn;

	list_for_each_entry_safe(fence, fn, &ggtt->fence_list, link) {
		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);

		if (fence == active) /* now seen this fence twice */
			active = ERR_PTR(-EAGAIN);

		/* Prefer idle fences so we do not have to wait on the GPU */
		if (active != ERR_PTR(-EAGAIN) && fence_is_active(fence)) {
			if (!active)
				active = fence;

			list_move_tail(&fence->link, &ggtt->fence_list);
			continue;
		}

		if (atomic_read(&fence->pin_count))
			continue;

		return fence;
	}

	/* Wait for completion of pending flips which consume fences */
	if (intel_has_pending_fb_unpin(ggtt->vm.i915))
		return ERR_PTR(-EAGAIN);

	return ERR_PTR(-EDEADLK);
}

int __i915_vma_pin_fence(struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
	struct i915_fence_reg *fence;
	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
	int err;

	lockdep_assert_held(&vma->vm->mutex);

	/* Just update our place in the LRU if our fence is getting reused. */
	if (vma->fence) {
		fence = vma->fence;
		GEM_BUG_ON(fence->vma != vma);
		atomic_inc(&fence->pin_count);
		if (!fence->dirty) {
			list_move_tail(&fence->link, &ggtt->fence_list);
			return 0;
		}
	} else if (set) {
		fence = fence_find(ggtt);
		if (IS_ERR(fence))
			return PTR_ERR(fence);

		GEM_BUG_ON(atomic_read(&fence->pin_count));
		atomic_inc(&fence->pin_count);
	} else {
		return 0;
	}

	err = fence_update(fence, set);
	if (err)
		goto out_unpin;

	GEM_BUG_ON(fence->vma != set);
	GEM_BUG_ON(vma->fence != (set ? fence : NULL));

	if (set)
		return 0;

out_unpin:
	atomic_dec(&fence->pin_count);
	return err;
}

/**
 * i915_vma_pin_fence - set up fencing for a vma
 * @vma: vma to map through a fence reg
 *
 * When mapping objects through the GTT, userspace wants to be able to write
 * to them without having to worry about swizzling if the object is tiled.
 * This function walks the fence regs looking for a free one for @vma,
 * stealing one if it can't find any.
 *
 * It then sets up the reg based on the object's properties: address, pitch
 * and tiling format.
 *
 * For an untiled surface, this removes any existing fence.
 *
 * Returns:
 *
 * 0 on success, negative error code on failure.
 */
int i915_vma_pin_fence(struct i915_vma *vma)
{
	int err;

	if (!vma->fence && !i915_gem_object_is_tiled(vma->obj))
		return 0;

	/*
	 * Note that we revoke fences on runtime suspend. Therefore the user
	 * must keep the device awake whilst using the fence.
	 */
	assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm);
	GEM_BUG_ON(!i915_vma_is_pinned(vma));
	GEM_BUG_ON(!i915_vma_is_ggtt(vma));

	err = mutex_lock_interruptible(&vma->vm->mutex);
	if (err)
		return err;

	err = __i915_vma_pin_fence(vma);
	mutex_unlock(&vma->vm->mutex);

	return err;
}

/**
 * i915_reserve_fence - Reserve a fence for vGPU
 * @ggtt: Global GTT
 *
 * This function walks the fence regs looking for a free one and removes
 * it from the fence_list. It is used to reserve a fence for the vGPU to use.
 */
struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt)
{
	struct i915_fence_reg *fence;
	int count;
	int ret;

	lockdep_assert_held(&ggtt->vm.mutex);

	/* Keep at least one fence available for the display engine. */
	count = 0;
	list_for_each_entry(fence, &ggtt->fence_list, link)
		count += !atomic_read(&fence->pin_count);
	if (count <= 1)
		return ERR_PTR(-ENOSPC);

	fence = fence_find(ggtt);
	if (IS_ERR(fence))
		return fence;

	if (fence->vma) {
		/* Force-remove fence from VMA */
		ret = fence_update(fence, NULL);
		if (ret)
			return ERR_PTR(ret);
	}

	list_del(&fence->link);

	return fence;
}

/**
 * i915_unreserve_fence - Reclaim a reserved fence
 * @fence: the fence reg
 *
 * This function adds a fence register previously reserved for vGPU use back
 * to the fence_list.
 */
void i915_unreserve_fence(struct i915_fence_reg *fence)
{
	struct i915_ggtt *ggtt = fence->ggtt;

	lockdep_assert_held(&ggtt->vm.mutex);

	list_add(&fence->link, &ggtt->fence_list);
}
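
/*
 * A hedged sketch of how the reserve/unreserve pair above is expected to be
 * used by vGPU code. The locking is shown because both helpers assert that
 * ggtt->vm.mutex is held; the surrounding error handling is illustrative only:
 *
 *	struct i915_fence_reg *fence;
 *
 *	mutex_lock(&ggtt->vm.mutex);
 *	fence = i915_reserve_fence(ggtt);
 *	mutex_unlock(&ggtt->vm.mutex);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *
 *	... hand fence->id over to the guest ...
 *
 *	mutex_lock(&ggtt->vm.mutex);
 *	i915_unreserve_fence(fence);
 *	mutex_unlock(&ggtt->vm.mutex);
 */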

/**
 * intel_ggtt_restore_fences - restore fence state
 * @ggtt: Global GTT
 *
 * Restore the hw fence state to match the software tracking again, to be called
517 518
 * after a gpu reset and on resume. Note that on runtime suspend we only cancel
 * the fences, to be reacquired by the user later.
D
520
void intel_ggtt_restore_fences(struct i915_ggtt *ggtt)
521 522 523
{
	int i;

	for (i = 0; i < ggtt->num_fences; i++)
		fence_write(&ggtt->fence_regs[i]);
}

/**
 * DOC: tiling swizzling details
 *
 * The idea behind tiling is to increase cache hit rates by rearranging
 * pixel data so that a group of pixel accesses are in the same cacheline.
 * Performance improvements from doing this on the back/depth buffer are on
 * the order of 30%.
 *
 * Intel architectures make this somewhat more complicated, though, by
 * adjustments made to addressing of data when the memory is in interleaved
 * mode (matched pairs of DIMMS) to improve memory bandwidth.
 * For interleaved memory, the CPU sends every sequential 64 bytes
 * to an alternate memory channel so it can get the bandwidth from both.
 *
 * The GPU also rearranges its accesses for increased bandwidth to interleaved
 * memory, and it matches what the CPU does for non-tiled.  However, when tiled
 * it does it a little differently, since one walks addresses not just in the
 * X direction but also Y.  So, along with alternating channels when bit
 * 6 of the address flips, it also alternates when other bits flip --  Bits 9
 * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines)
 * are common to both the 915 and 965-class hardware.
 *
 * The CPU also sometimes XORs in higher bits as well, to improve
 * bandwidth doing strided access like we do so frequently in graphics.  This
 * is called "Channel XOR Randomization" in the MCH documentation.  The result
 * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address
 * decode.
 *
 * All of this bit 6 XORing has an effect on our memory management,
 * as we need to make sure that the 3d driver can correctly address object
 * contents.
 *
 * If we don't have interleaved memory, all tiling is safe and no swizzling is
 * required.
 *
 * When bit 17 is XORed in, we simply refuse to tile at all.  Bit
 * 17 is not just a page offset, so as we page an object out and back in,
 * individual pages in it will have different bit 17 addresses, resulting in
 * each 64 bytes being swapped with its neighbor!
 *
 * Otherwise, if interleaved, we have to tell the 3d driver what the address
 * swizzling it needs to do is, since it's writing with the CPU to the pages
 * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the
 * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling
 * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order
 * to match what the GPU expects.
 */
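
/*
 * For illustration, a hedged sketch (not a helper used by this file) of the
 * address transformation the swizzle modes above imply: bit 6 of a linear
 * offset into a tiled object is XORed with the listed higher bits before the
 * CPU touches the backing pages.
 *
 *	static unsigned long bit_6_swizzle(unsigned long offset, u32 swizzle)
 *	{
 *		unsigned long bit6 = 0;
 *
 *		switch (swizzle) {
 *		case I915_BIT_6_SWIZZLE_9:
 *			bit6 = (offset >> 9) & 1;
 *			break;
 *		case I915_BIT_6_SWIZZLE_9_10:
 *			bit6 = ((offset >> 9) ^ (offset >> 10)) & 1;
 *			break;
 *		case I915_BIT_6_SWIZZLE_9_10_11:
 *			bit6 = ((offset >> 9) ^ (offset >> 10) ^ (offset >> 11)) & 1;
 *			break;
 *		}
 *
 *		return offset ^ (bit6 << 6);
 *	}
 */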

/**
 * detect_bit_6_swizzle - detect bit 6 swizzling pattern
 * @ggtt: Global GGTT
 *
 * Detects bit 6 swizzling of address lookup between IGD access and CPU
 * access through main memory.
 */
static void detect_bit_6_swizzle(struct i915_ggtt *ggtt)
{
	struct intel_uncore *uncore = ggtt->vm.gt->uncore;
	struct drm_i915_private *i915 = ggtt->vm.i915;
	u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
	u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;

	if (INTEL_GEN(i915) >= 8 || IS_VALLEYVIEW(i915)) {
		/*
		 * On BDW+, swizzling is not used. We leave the CPU memory
		 * controller in charge of optimizing memory accesses without
		 * the extra address manipulation GPU side.
		 *
		 * VLV and CHV don't have GPU swizzling.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	} else if (INTEL_GEN(i915) >= 6) {
		if (i915->preserve_bios_swizzle) {
			if (intel_uncore_read(uncore, DISP_ARB_CTL) &
			    DISP_TILE_SURFACE_SWIZZLING) {
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else {
				swizzle_x = I915_BIT_6_SWIZZLE_NONE;
				swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			}
		} else {
			u32 dimm_c0, dimm_c1;
			dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0);
			dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1);
			dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
			dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
			/*
			 * Enable swizzling when the channels are populated
			 * with identically sized dimms. We don't need to check
			 * the 3rd channel because no cpu with gpu attached
			 * ships in that configuration. Also, swizzling only
			 * makes sense for 2 channels anyway.
			 */
			if (dimm_c0 == dimm_c1) {
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else {
				swizzle_x = I915_BIT_6_SWIZZLE_NONE;
				swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			}
		}
	} else if (IS_GEN(i915, 5)) {
		/*
		 * On Ironlake, whatever the DRAM config, the GPU always uses the
		 * same swizzling setup.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_9_10;
		swizzle_y = I915_BIT_6_SWIZZLE_9;
	} else if (IS_GEN(i915, 2)) {
		/*
		 * As far as we know, the 865 doesn't have these bit 6
		 * swizzling issues.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	} else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) {
		/*
		 * The 965, G33, and newer, have a very flexible memory
		 * configuration.  It will enable dual-channel mode
		 * (interleaving) on as much memory as it can, and the GPU
		 * will additionally sometimes enable different bit 6
		 * swizzling for tiled objects from the CPU.
		 *
		 * Here's what I found on the G965:
		 *    slot fill         memory size  swizzling
		 * 0A   0B   1A   1B    1-ch   2-ch
		 * 512  0    0    0     512    0     O
		 * 512  0    512  0     16     1008  X
		 * 512  0    0    512   16     1008  X
		 * 0    512  0    512   16     1008  X
		 * 1024 1024 1024 0     2048   1024  O
		 *
		 * We could probably detect this based on either the DRB
		 * matching, which was the case for the swizzling required in
		 * the table above, or from the 1-ch value being less than
		 * the minimum size of a rank.
		 *
		 * Reports indicate that the swizzling actually
		 * varies depending upon page placement inside the
		 * channels, i.e. we see swizzled pages where the
		 * banks of memory are paired and unswizzled on the
		 * uneven portion, so leave that as unknown.
		 */
		if (intel_uncore_read(uncore, C0DRB3) ==
		    intel_uncore_read(uncore, C1DRB3)) {
			swizzle_x = I915_BIT_6_SWIZZLE_9_10;
			swizzle_y = I915_BIT_6_SWIZZLE_9;
		}
	} else {
		u32 dcc = intel_uncore_read(uncore, DCC);

		/*
		 * On 9xx chipsets, channel interleave by the CPU is
		 * determined by DCC.  For single-channel, neither the CPU
		 * nor the GPU do swizzling.  For dual channel interleaved,
		 * the GPU's interleave is bit 9 and 10 for X tiled, and bit
		 * 9 for Y tiled.  The CPU's interleave is independent, and
		 * can be based on either bit 11 (haven't seen this yet) or
		 * bit 17 (common).
		 */
		switch (dcc & DCC_ADDRESSING_MODE_MASK) {
		case DCC_ADDRESSING_MODE_SINGLE_CHANNEL:
		case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC:
			swizzle_x = I915_BIT_6_SWIZZLE_NONE;
			swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			break;
		case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED:
			if (dcc & DCC_CHANNEL_XOR_DISABLE) {
				/*
				 * This is the base swizzling by the GPU for
				 * tiled buffers.
				 */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) {
				/* Bit 11 swizzling by the CPU in addition. */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10_11;
				swizzle_y = I915_BIT_6_SWIZZLE_9_11;
			} else {
				/* Bit 17 swizzling by the CPU in addition. */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10_17;
				swizzle_y = I915_BIT_6_SWIZZLE_9_17;
			}
			break;
		}

		/* check for L-shaped memory aka modified enhanced addressing */
		if (IS_GEN(i915, 4) &&
		    !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) {
			swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
			swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
		}

		if (dcc == 0xffffffff) {
			drm_err(&i915->drm, "Couldn't read from MCHBAR.  "
				  "Disabling tiling.\n");
			swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
			swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
		}
	}

	if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN ||
	    swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) {
		/*
		 * Userspace likes to explode if it sees unknown swizzling,
		 * so lie. We will finish the lie when reporting through
		 * the get-tiling-ioctl by reporting the physical swizzle
		 * mode as unknown instead.
		 *
		 * As we don't strictly know what the swizzling is, it may be
		 * bit17 dependent, and so we need to also prevent the pages
		 * from being moved.
		 */
		i915->quirks |= QUIRK_PIN_SWIZZLED_PAGES;
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	}

	i915->ggtt.bit_6_swizzle_x = swizzle_x;
	i915->ggtt.bit_6_swizzle_y = swizzle_y;
}

/*
 * Swap every 64 bytes of this page around, to account for it having a new
 * bit 17 of its physical address and therefore being interpreted differently
 * by the GPU.
 */
static void swizzle_page(struct page *page)
{
	char temp[64];
	char *vaddr;
	int i;

	vaddr = kmap(page);

	for (i = 0; i < PAGE_SIZE; i += 128) {
		memcpy(temp, &vaddr[i], 64);
		memcpy(&vaddr[i], &vaddr[i + 64], 64);
		memcpy(&vaddr[i + 64], temp, 64);
	}

	kunmap(page);
}

/**
 * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling
 * @obj: i915 GEM buffer object
 * @pages: the scattergather list of physical pages
 *
 * This function fixes up the swizzling in case any page frame number for this
 * object has changed in bit 17 since that state has been saved with
 * i915_gem_object_save_bit_17_swizzle().
 *
 * This is called when pinning backing storage again, since the kernel is free
 * to move unpinned backing storage around (either by directly moving pages or
 * by swapping them out and back in again).
 */
void
i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
				  struct sg_table *pages)
{
	struct sgt_iter sgt_iter;
	struct page *page;
	int i;

	if (obj->bit_17 == NULL)
		return;

	i = 0;
	for_each_sgt_page(page, sgt_iter, pages) {
		char new_bit_17 = page_to_phys(page) >> 17;
		if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) {
			swizzle_page(page);
			set_page_dirty(page);
		}
		i++;
	}
}

/**
 * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling
 * @obj: i915 GEM buffer object
 * @pages: the scattergather list of physical pages
 *
 * This function saves the bit 17 of each page frame number so that swizzling
 * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must
 * be called before the backing storage can be unpinned.
 */
void
i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
				    struct sg_table *pages)
{
	const unsigned int page_count = obj->base.size >> PAGE_SHIFT;
	struct sgt_iter sgt_iter;
	struct page *page;
	int i;

	if (obj->bit_17 == NULL) {
		obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL);
		if (obj->bit_17 == NULL) {
			DRM_ERROR("Failed to allocate memory for bit 17 "
				  "record\n");
			return;
		}
	}

	i = 0;

	for_each_sgt_page(page, sgt_iter, pages) {
		if (page_to_phys(page) & (1 << 17))
			__set_bit(i, obj->bit_17);
		else
			__clear_bit(i, obj->bit_17);
		i++;
	}
}

void intel_ggtt_init_fences(struct i915_ggtt *ggtt)
{
	struct drm_i915_private *i915 = ggtt->vm.i915;
	struct intel_uncore *uncore = ggtt->vm.gt->uncore;
	int num_fences;
	int i;

	INIT_LIST_HEAD(&ggtt->fence_list);
	INIT_LIST_HEAD(&ggtt->userfault_list);
	intel_wakeref_auto_init(&ggtt->userfault_wakeref, uncore->rpm);

	detect_bit_6_swizzle(ggtt);

	if (!i915_ggtt_has_aperture(ggtt))
		num_fences = 0;
	else if (INTEL_GEN(i915) >= 7 &&
		 !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)))
		num_fences = 32;
	else if (INTEL_GEN(i915) >= 4 ||
		 IS_I945G(i915) || IS_I945GM(i915) ||
		 IS_G33(i915) || IS_PINEVIEW(i915))
		num_fences = 16;
	else
		num_fences = 8;

	if (intel_vgpu_active(i915))
		num_fences = intel_uncore_read(uncore,
					       vgtif_reg(avail_rs.fence_num));
	ggtt->fence_regs = kcalloc(num_fences,
				   sizeof(*ggtt->fence_regs),
				   GFP_KERNEL);
	if (!ggtt->fence_regs)
		num_fences = 0;

	/* Initialize fence registers to zero */
	for (i = 0; i < num_fences; i++) {
		struct i915_fence_reg *fence = &ggtt->fence_regs[i];

		i915_active_init(&fence->active, NULL, NULL);
		fence->ggtt = ggtt;
		fence->id = i;
		list_add_tail(&fence->link, &ggtt->fence_list);
	}
	ggtt->num_fences = num_fences;

	intel_ggtt_restore_fences(ggtt);
}

void intel_ggtt_fini_fences(struct i915_ggtt *ggtt)
{
	int i;

	for (i = 0; i < ggtt->num_fences; i++) {
		struct i915_fence_reg *fence = &ggtt->fence_regs[i];

		i915_active_fini(&fence->active);
	}

	kfree(ggtt->fence_regs);
}

void intel_gt_init_swizzling(struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;
	struct intel_uncore *uncore = gt->uncore;

	if (INTEL_GEN(i915) < 5 ||
	    i915->ggtt.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
		return;

	intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING);

	if (IS_GEN(i915, 5))
		return;

	intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL);

	if (IS_GEN(i915, 6))
		intel_uncore_write(uncore,
				   ARB_MODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
	else if (IS_GEN(i915, 7))
		intel_uncore_write(uncore,
				   ARB_MODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
	else if (IS_GEN(i915, 8))
		intel_uncore_write(uncore,
				   GAMTARBMODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
	else
		MISSING_CASE(INTEL_GEN(i915));
}