/*
 * Copyright © 2008-2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Zou Nan hai <nanhai.zou@intel.com>
 *    Xiang Hai hao<haihao.xiang@intel.com>
 *
 */

#include <linux/log2.h>
#include <drm/drmP.h>
#include "i915_drv.h"
#include <drm/i915_drm.h>
#include "i915_trace.h"
#include "intel_drv.h"

/* Rough estimate of the typical request size, performing a flush,
 * set-context and then emitting the batch.
 */
#define LEGACY_REQUEST_SIZE 200

int __intel_ring_space(int head, int tail, int size)
{
	int space = head - tail;
	if (space <= 0)
		space += size;
	return space - I915_RING_FREE_SPACE;
}
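
/*
 * Illustrative example of the arithmetic above (not from any spec): with a
 * 4096-byte ring, head == 0x30 and tail == 0xb0 gives a negative difference,
 * so the ring size is added back and 0x1000 - 0x80 - I915_RING_FREE_SPACE
 * bytes are reported as usable.  The I915_RING_FREE_SPACE slack keeps the
 * tail from ever catching up with the head, since head == tail is read as
 * an empty ring rather than a full one.
 */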

void intel_ring_update_space(struct intel_ringbuffer *ringbuf)
{
	if (ringbuf->last_retired_head != -1) {
		ringbuf->head = ringbuf->last_retired_head;
		ringbuf->last_retired_head = -1;
	}

	ringbuf->space = __intel_ring_space(ringbuf->head & HEAD_ADDR,
					    ringbuf->tail, ringbuf->size);
}

static void __intel_ring_advance(struct intel_engine_cs *engine)
{
	struct intel_ringbuffer *ringbuf = engine->buffer;
	ringbuf->tail &= ringbuf->size - 1;
	engine->write_tail(engine, ringbuf->tail);
}
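
/*
 * Note: ring sizes are powers of two, so the mask above is what wraps the
 * software tail back to the start of the buffer before it is handed to the
 * engine's write_tail() hook.
 */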

static int
gen2_render_ring_flush(struct drm_i915_gem_request *req,
		       u32	invalidate_domains,
		       u32	flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 cmd;
	int ret;

	cmd = MI_FLUSH;
	if (((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER) == 0)
		cmd |= MI_NO_WRITE_FLUSH;

	if (invalidate_domains & I915_GEM_DOMAIN_SAMPLER)
		cmd |= MI_READ_FLUSH;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

static int
gen4_render_ring_flush(struct drm_i915_gem_request *req,
		       u32	invalidate_domains,
		       u32	flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 cmd;
	int ret;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */

	cmd = MI_FLUSH | MI_NO_WRITE_FLUSH;
	if ((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER)
		cmd &= ~MI_NO_WRITE_FLUSH;
	if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION)
		cmd |= MI_EXE_FLUSH;

	if (invalidate_domains & I915_GEM_DOMAIN_COMMAND &&
	    (IS_G4X(req->i915) || IS_GEN5(req->i915)))
		cmd |= MI_INVALIDATE_ISP;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(5));
	intel_ring_emit(engine, PIPE_CONTROL_CS_STALL |
			PIPE_CONTROL_STALL_AT_SCOREBOARD);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
	intel_ring_emit(engine, 0); /* low dword */
	intel_ring_emit(engine, 0); /* high dword */
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(5));
	intel_ring_emit(engine, PIPE_CONTROL_QW_WRITE);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

static int
gen6_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 flags = 0;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = intel_emit_post_sync_nonzero_flush(req);
	if (ret)
		return ret;

	/* Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, PIPE_CONTROL_CS_STALL |
			      PIPE_CONTROL_STALL_AT_SCOREBOARD);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen7_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 flags = 0;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/* Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;

		/* Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set. */
		gen7_render_ring_cs_stall_wa(req);
	}

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen8_emit_pipe_control(struct drm_i915_gem_request *req,
		       u32 flags, u32 scratch_addr)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(6));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen8_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	u32 flags = 0;
	u32 scratch_addr = req->engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	flags |= PIPE_CONTROL_CS_STALL;

	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		/* WaCsStallBeforeStateCacheInvalidate:bdw,chv */
		ret = gen8_emit_pipe_control(req,
					     PIPE_CONTROL_CS_STALL |
					     PIPE_CONTROL_STALL_AT_SCOREBOARD,
					     0);
		if (ret)
			return ret;
	}

	return gen8_emit_pipe_control(req, flags, scratch_addr);
}

static void ring_write_tail(struct intel_engine_cs *engine,
			    u32 value)
{
	struct drm_i915_private *dev_priv = engine->i915;
	I915_WRITE_TAIL(engine, value);
}

u64 intel_ring_get_active_head(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	u64 acthd;

	if (INTEL_GEN(dev_priv) >= 8)
		acthd = I915_READ64_2x32(RING_ACTHD(engine->mmio_base),
					 RING_ACTHD_UDW(engine->mmio_base));
	else if (INTEL_GEN(dev_priv) >= 4)
		acthd = I915_READ(RING_ACTHD(engine->mmio_base));
	else
		acthd = I915_READ(ACTHD);

	return acthd;
}

static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	u32 addr;

	addr = dev_priv->status_page_dmah->busaddr;
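	/*
	 * Note: on gen4+ the shift/mask below folds physical address bits
	 * 35:32 into bits 7:4 of the value written to HWS_PGA, so a status
	 * page allocated above 4GiB can still be programmed.
	 */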
	if (INTEL_GEN(dev_priv) >= 4)
		addr |= (dev_priv->status_page_dmah->busaddr >> 28) & 0xf0;
	I915_WRITE(HWS_PGA, addr);
}

static void intel_ring_setup_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	i915_reg_t mmio;

	/* The ring status page addresses are no longer next to the rest of
	 * the ring registers as of gen7.
	 */
	if (IS_GEN7(dev_priv)) {
		switch (engine->id) {
		case RCS:
			mmio = RENDER_HWS_PGA_GEN7;
			break;
		case BCS:
			mmio = BLT_HWS_PGA_GEN7;
			break;
		/*
		 * VCS2 doesn't actually exist on Gen7; it is listed here
		 * only to silence the gcc switch completeness warning.
		 */
		case VCS2:
		case VCS:
			mmio = BSD_HWS_PGA_GEN7;
			break;
		case VECS:
			mmio = VEBOX_HWS_PGA_GEN7;
			break;
		}
	} else if (IS_GEN6(dev_priv)) {
		mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
	} else {
		/* XXX: gen8 returns to sanity */
		mmio = RING_HWS_PGA(engine->mmio_base);
	}

	I915_WRITE(mmio, (u32)engine->status_page.gfx_addr);
	POSTING_READ(mmio);

	/*
	 * Flush the TLB for this page
	 *
	 * FIXME: These two bits have disappeared on gen8, so a question
	 * arises: do we still need this and if so how should we go about
	 * invalidating the TLB?
	 */
	if (IS_GEN(dev_priv, 6, 7)) {
		i915_reg_t reg = RING_INSTPM(engine->mmio_base);

		/* The ring should be idle before issuing a sync flush */
		WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);

		I915_WRITE(reg,
			   _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
					      INSTPM_SYNC_FLUSH));
		if (intel_wait_for_register(dev_priv,
					    reg, INSTPM_SYNC_FLUSH, 0,
					    1000))
			DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
				  engine->name);
	}
}

static bool stop_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (!IS_GEN2(dev_priv)) {
		I915_WRITE_MODE(engine, _MASKED_BIT_ENABLE(STOP_RING));
		if (intel_wait_for_register(dev_priv,
					    RING_MI_MODE(engine->mmio_base),
					    MODE_IDLE,
					    MODE_IDLE,
					    1000)) {
			DRM_ERROR("%s : timed out trying to stop ring\n",
				  engine->name);
			/* Sometimes we observe that the idle flag is not
			 * set even though the ring is empty. So double
			 * check before giving up.
			 */
			if (I915_READ_HEAD(engine) != I915_READ_TAIL(engine))
				return false;
		}
	}

	I915_WRITE_CTL(engine, 0);
	I915_WRITE_HEAD(engine, 0);
	engine->write_tail(engine, 0);

	if (!IS_GEN2(dev_priv)) {
		(void)I915_READ_CTL(engine);
		I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
	}

	return (I915_READ_HEAD(engine) & HEAD_ADDR) == 0;
}

void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
{
	memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
}

static int init_ring_common(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_ringbuffer *ringbuf = engine->buffer;
	struct drm_i915_gem_object *obj = ringbuf->obj;
	int ret = 0;

	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);

	if (!stop_ring(engine)) {
		/* G45 ring initialization often fails to reset head to zero */
		DRM_DEBUG_KMS("%s head not reset to zero "
			      "ctl %08x head %08x tail %08x start %08x\n",
			      engine->name,
			      I915_READ_CTL(engine),
			      I915_READ_HEAD(engine),
			      I915_READ_TAIL(engine),
			      I915_READ_START(engine));

		if (!stop_ring(engine)) {
			DRM_ERROR("failed to set %s head to zero "
				  "ctl %08x head %08x tail %08x start %08x\n",
				  engine->name,
				  I915_READ_CTL(engine),
				  I915_READ_HEAD(engine),
				  I915_READ_TAIL(engine),
				  I915_READ_START(engine));
			ret = -EIO;
			goto out;
		}
	}

	if (I915_NEED_GFX_HWS(dev_priv))
		intel_ring_setup_status_page(engine);
	else
		ring_setup_phys_status_page(engine);

	/* Enforce ordering by reading HEAD register back */
	I915_READ_HEAD(engine);

	/* Initialize the ring. This must happen _after_ we've cleared the ring
	 * registers with the above sequence (the readback of the HEAD registers
	 * also enforces ordering), otherwise the hw might lose the new ring
	 * register values. */
	I915_WRITE_START(engine, i915_gem_obj_ggtt_offset(obj));

	/* WaClearRingBufHeadRegAtInit:ctg,elk */
	if (I915_READ_HEAD(engine))
		DRM_DEBUG("%s initialization failed [head=%08x], fudging\n",
			  engine->name, I915_READ_HEAD(engine));
	I915_WRITE_HEAD(engine, 0);
	(void)I915_READ_HEAD(engine);

	I915_WRITE_CTL(engine,
			((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES)
			| RING_VALID);

	/* If the head is still not zero, the ring is dead */
	if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 &&
		     I915_READ_START(engine) == i915_gem_obj_ggtt_offset(obj) &&
		     (I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) {
		DRM_ERROR("%s initialization failed "
			  "ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08lx]\n",
			  engine->name,
			  I915_READ_CTL(engine),
			  I915_READ_CTL(engine) & RING_VALID,
			  I915_READ_HEAD(engine), I915_READ_TAIL(engine),
			  I915_READ_START(engine),
			  (unsigned long)i915_gem_obj_ggtt_offset(obj));
		ret = -EIO;
		goto out;
	}

	ringbuf->last_retired_head = -1;
	ringbuf->head = I915_READ_HEAD(engine);
	ringbuf->tail = I915_READ_TAIL(engine) & TAIL_ADDR;
	intel_ring_update_space(ringbuf);

	intel_engine_init_hangcheck(engine);

out:
	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);

	return ret;
}

void intel_fini_pipe_control(struct intel_engine_cs *engine)
{
	if (engine->scratch.obj == NULL)
		return;

	i915_gem_object_ggtt_unpin(engine->scratch.obj);
	drm_gem_object_unreference(&engine->scratch.obj->base);
	engine->scratch.obj = NULL;
}

int intel_init_pipe_control(struct intel_engine_cs *engine, int size)
{
	struct drm_i915_gem_object *obj;
	int ret;

	WARN_ON(engine->scratch.obj);

	obj = i915_gem_object_create_stolen(&engine->i915->drm, size);
	if (!obj)
		obj = i915_gem_object_create(&engine->i915->drm, size);
	if (IS_ERR(obj)) {
		DRM_ERROR("Failed to allocate scratch page\n");
		ret = PTR_ERR(obj);
		goto err;
	}

	ret = i915_gem_obj_ggtt_pin(obj, 4096, PIN_HIGH);
	if (ret)
		goto err_unref;

	engine->scratch.obj = obj;
	engine->scratch.gtt_offset = i915_gem_obj_ggtt_offset(obj);
	DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08x\n",
			 engine->name, engine->scratch.gtt_offset);
	return 0;

err_unref:
	drm_gem_object_unreference(&engine->scratch.obj->base);
err:
	return ret;
}

static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	struct i915_workarounds *w = &req->i915->workarounds;
	int ret, i;

	if (w->count == 0)
		return 0;

	engine->gpu_caches_dirty = true;
	ret = intel_ring_flush_all_caches(req);
	if (ret)
		return ret;

	ret = intel_ring_begin(req, (w->count * 2 + 2));
	if (ret)
		return ret;

	intel_ring_emit(engine, MI_LOAD_REGISTER_IMM(w->count));
	for (i = 0; i < w->count; i++) {
		intel_ring_emit_reg(engine, w->reg[i].addr);
		intel_ring_emit(engine, w->reg[i].value);
	}
	intel_ring_emit(engine, MI_NOOP);

	intel_ring_advance(engine);

	engine->gpu_caches_dirty = true;
	ret = intel_ring_flush_all_caches(req);
	if (ret)
		return ret;

	DRM_DEBUG_DRIVER("Number of Workarounds emitted: %d\n", w->count);

	return 0;
}
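
/*
 * Note on the dword budget above: a single MI_LOAD_REGISTER_IMM header,
 * one register/value pair per workaround and a trailing MI_NOOP add up to
 * exactly the (w->count * 2 + 2) dwords reserved via intel_ring_begin().
 */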

static int intel_rcs_ctx_init(struct drm_i915_gem_request *req)
{
	int ret;

	ret = intel_ring_workarounds_emit(req);
	if (ret != 0)
		return ret;

	ret = i915_gem_render_state_init(req);
	if (ret)
		return ret;

	return 0;
}

static int wa_add(struct drm_i915_private *dev_priv,
		  i915_reg_t addr,
		  const u32 mask, const u32 val)
{
	const u32 idx = dev_priv->workarounds.count;

	if (WARN_ON(idx >= I915_MAX_WA_REGS))
		return -ENOSPC;

	dev_priv->workarounds.reg[idx].addr = addr;
	dev_priv->workarounds.reg[idx].value = val;
	dev_priv->workarounds.reg[idx].mask = mask;

	dev_priv->workarounds.count++;

	return 0;
}

#define WA_REG(addr, mask, val) do { \
		const int r = wa_add(dev_priv, (addr), (mask), (val)); \
		if (r) \
			return r; \
	} while (0)

#define WA_SET_BIT_MASKED(addr, mask) \
	WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask))

#define WA_CLR_BIT_MASKED(addr, mask) \
	WA_REG(addr, (mask), _MASKED_BIT_DISABLE(mask))

#define WA_SET_FIELD_MASKED(addr, mask, value) \
	WA_REG(addr, mask, _MASKED_FIELD(mask, value))

#define WA_SET_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) | (mask))
#define WA_CLR_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) & ~(mask))

#define WA_WRITE(addr, val) WA_REG(addr, 0xffffffff, val)
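
/*
 * Usage sketch (illustrative only): inside a *_init_workarounds() function
 * with dev_priv in scope, a line such as
 *
 *	WA_SET_BIT_MASKED(HDC_CHICKEN0, HDC_FORCE_NON_COHERENT);
 *
 * only records the register/mask/value tuple via wa_add(); the actual
 * writes are replayed from the ring by intel_ring_workarounds_emit().
 */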

static int wa_ring_whitelist_reg(struct intel_engine_cs *engine,
				 i915_reg_t reg)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct i915_workarounds *wa = &dev_priv->workarounds;
	const uint32_t index = wa->hw_whitelist_count[engine->id];

	if (WARN_ON(index >= RING_MAX_NONPRIV_SLOTS))
		return -EINVAL;

	WA_WRITE(RING_FORCE_TO_NONPRIV(engine->mmio_base, index),
		 i915_mmio_reg_offset(reg));
	wa->hw_whitelist_count[engine->id]++;

	return 0;
}
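
/*
 * Note: each engine exposes RING_MAX_NONPRIV_SLOTS FORCE_TO_NONPRIV slots,
 * and writing a register offset into a free slot is what lets unprivileged
 * batches touch that register, hence the per-engine whitelist count above.
 */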

static int gen8_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);

	/* WaDisableAsyncFlipPerfMode:bdw,chv */
	WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);

	/* WaDisablePartialInstShootdown:bdw,chv */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);

	/* Use Force Non-Coherent whenever executing a 3D context. This is a
	 * workaround for a possible hang in the unlikely event a TLB
	 * invalidation occurs during a PSD flush.
	 */
	/* WaForceEnableNonCoherent:bdw,chv */
	/* WaHdcDisableFetchWhenMasked:bdw,chv */
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  HDC_DONOT_FETCH_MEM_WHEN_MASKED |
			  HDC_FORCE_NON_COHERENT);

	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
	 *  polygons in the same 8x4 pixel/sample area to be processed without
	 *  stalling waiting for the earlier ones to write to Hierarchical Z
	 *  buffer."
	 *
	 * This optimization is off by default for BDW and CHV; turn it on.
	 */
	WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);

	/* Wa4x4STCOptimizationDisable:bdw,chv */
	WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);

	/*
	 * BSpec recommends 8x4 when MSAA is used,
	 * however in practice 16x4 seems fastest.
	 *
	 * Note that PS/WM thread counts depend on the WIZ hashing
	 * disable bit, which we don't touch here, but it's good
	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
	 */
	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
			    GEN6_WIZ_HASHING_MASK,
			    GEN6_WIZ_HASHING_16x4);

	return 0;
}

static int bdw_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen8_init_workarounds(engine);
	if (ret)
		return ret;

	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);

	/* WaDisableDopClockGating:bdw */
	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
			  DOP_CLOCK_GATING_DISABLE);

	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
			  GEN8_SAMPLER_POWER_BYPASS_DIS);

	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  /* WaForceContextSaveRestoreNonCoherent:bdw */
			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
			  /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
			  (IS_BDW_GT3(dev_priv) ? HDC_FENCE_DEST_SLM_DISABLE : 0));

	return 0;
}

static int chv_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen8_init_workarounds(engine);
	if (ret)
		return ret;

	/* WaDisableThreadStallDopClockGating:chv */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);

	/* Improve HiZ throughput on CHV. */
	WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);

	return 0;
}

static int gen9_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	/* WaConextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl */
	I915_WRITE(GEN9_CSFE_CHICKEN1_RCS, _MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));

	/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl */
	I915_WRITE(BDW_SCRATCH1, I915_READ(BDW_SCRATCH1) |
		   GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);

	/* WaDisableKillLogic:bxt,skl,kbl */
	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
		   ECOCHK_DIS_TLB);

	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl */
	/* WaDisablePartialInstShootdown:skl,bxt,kbl */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
			  FLOW_CONTROL_ENABLE |
			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);

	/* Syncing dependencies between camera and graphics:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
			  GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC);

	/* WaDisableDgMirrorFixInHalfSliceChicken5:skl,bxt */
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
		WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
				  GEN9_DG_MIRROR_FIX_ENABLE);

	/* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
		WA_SET_BIT_MASKED(GEN7_COMMON_SLICE_CHICKEN1,
				  GEN9_RHWO_OPTIMIZATION_DISABLE);
		/*
		 * WA also requires GEN9_SLICE_COMMON_ECO_CHICKEN0[14:14] to be set
		 * but we do that in per ctx batchbuffer as there is an issue
		 * with this register not getting restored on ctx restore
		 */
	}

	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl */
	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl */
	WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
			  GEN9_ENABLE_YV12_BUGFIX |
			  GEN9_ENABLE_GPGPU_PREEMPTION);

	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl */
	/* WaDisablePartialResolveInVc:skl,bxt,kbl */
	WA_SET_BIT_MASKED(CACHE_MODE_1, (GEN8_4x4_STC_OPTIMIZATION_DISABLE |
					 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE));

	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl */
	WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
			  GEN9_CCS_TLB_PREFETCH_ENABLE);

	/* WaDisableMaskBasedCammingInRCC:skl,bxt */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, SKL_REVID_C0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
		WA_SET_BIT_MASKED(SLICE_ECO_CHICKEN0,
				  PIXEL_MASK_CAMMING_DISABLE);

	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);

	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
	 * both tied to WaForceContextSaveRestoreNonCoherent
	 * in some hsds for skl. We keep the tie for all gen9. The
	 * documentation is a bit hazy and so we want to get common behaviour,
	 * even though there is no clear evidence we would need both on kbl/bxt.
	 * This area has been source of system hangs so we play it safe
	 * and mimic the skl regardless of what bspec says.
	 *
	 * Use Force Non-Coherent whenever executing a 3D context. This
	 * is a workaround for a possible hang in the unlikely event
	 * a TLB invalidation occurs during a PSD flush.
	 */

	/* WaForceEnableNonCoherent:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  HDC_FORCE_NON_COHERENT);

	/* WaDisableHDCInvalidation:skl,bxt,kbl */
	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
		   BDW_DISABLE_HDC_INVALIDATION);

	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl */
	if (IS_SKYLAKE(dev_priv) ||
	    IS_KABYLAKE(dev_priv) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
				  GEN8_SAMPLER_POWER_BYPASS_DIS);

	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);

	/* WaOCLCoherentLineFlush:skl,bxt,kbl */
	I915_WRITE(GEN8_L3SQCREG4, (I915_READ(GEN8_L3SQCREG4) |
				    GEN8_LQSC_FLUSH_COHERENT_LINES));

	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt */
	ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
	if (ret)
		return ret;

	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl */
	ret = wa_ring_whitelist_reg(engine, GEN8_CS_CHICKEN1);
	if (ret)
		return ret;

	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl */
	ret = wa_ring_whitelist_reg(engine, GEN8_HDC_CHICKEN1);
	if (ret)
		return ret;

	return 0;
}

static int skl_tune_iz_hashing(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	u8 vals[3] = { 0, 0, 0 };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		u8 ss;

		/*
		 * Only consider slices where one, and only one, subslice has 7
		 * EUs
		 */
		if (!is_power_of_2(dev_priv->info.subslice_7eu[i]))
			continue;

		/*
		 * subslice_7eu[i] != 0 (because of the check above) and
		 * ss_max == 4 (maximum number of subslices possible per slice)
		 *
		 * ->    0 <= ss <= 3;
		 */
		ss = ffs(dev_priv->info.subslice_7eu[i]) - 1;
		vals[i] = 3 - ss;
	}

	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
		return 0;

	/* Tune IZ hashing. See intel_device_info_runtime_init() */
	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
			    GEN9_IZ_HASHING_MASK(2) |
			    GEN9_IZ_HASHING_MASK(1) |
			    GEN9_IZ_HASHING_MASK(0),
			    GEN9_IZ_HASHING(2, vals[2]) |
			    GEN9_IZ_HASHING(1, vals[1]) |
			    GEN9_IZ_HASHING(0, vals[0]));

	return 0;
}

static int skl_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen9_init_workarounds(engine);
	if (ret)
		return ret;

	/*
	 * Actual WA is to disable percontext preemption granularity control
	 * until D0 which is the default case so this is equivalent to
	 * !WaDisablePerCtxtPreemptionGranularityControl:skl
	 */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_E0, REVID_FOREVER)) {
		I915_WRITE(GEN7_FF_SLICE_CS_CHICKEN1,
			   _MASKED_BIT_ENABLE(GEN9_FFSC_PERCTX_PREEMPT_CTRL));
	}

	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_E0)) {
		/* WaDisableChickenBitTSGBarrierAckForFFSliceCS:skl */
		I915_WRITE(FF_SLICE_CS_CHICKEN2,
			   _MASKED_BIT_ENABLE(GEN9_TSG_BARRIER_ACK_DISABLE));
	}

	/* GEN8_L3SQCREG4 has a dependency with WA batch so any new changes
	 * involving this register should also be added to WA batch as required.
	 */
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_E0))
		/* WaDisableLSQCROPERFforOCL:skl */
		I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
			   GEN8_LQSC_RO_PERF_DIS);

	/* WaEnableGapsTsvCreditFix:skl */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, REVID_FOREVER)) {
		I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
					   GEN9_GAPS_TSV_CREDIT_DISABLE));
	}

	/* WaDisablePowerCompilerClockGating:skl */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_B0, SKL_REVID_B0))
		WA_SET_BIT_MASKED(HIZ_CHICKEN,
				  BDW_HIZ_POWER_COMPILER_CLOCK_GATING_DISABLE);

	/* WaBarrierPerformanceFixDisable:skl */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, SKL_REVID_D0))
		WA_SET_BIT_MASKED(HDC_CHICKEN0,
				  HDC_FENCE_DEST_SLM_DISABLE |
				  HDC_BARRIER_PERFORMANCE_DISABLE);

	/* WaDisableSbeCacheDispatchPortSharing:skl */
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_F0))
		WA_SET_BIT_MASKED(
			GEN7_HALF_SLICE_CHICKEN1,
			GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);

	/* WaDisableGafsUnitClkGating:skl */
	WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaDisableLSQCROPERFforOCL:skl */
	ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
	if (ret)
		return ret;

	return skl_tune_iz_hashing(engine);
}

static int bxt_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen9_init_workarounds(engine);
	if (ret)
		return ret;

	/* WaStoreMultiplePTEenable:bxt */
	/* This is a requirement according to Hardware specification */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
		I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_TLBPF);

	/* WaSetClckGatingDisableMedia:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
		I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
					    ~GEN8_DOP_CLOCK_GATE_MEDIA_ENABLE));
	}

	/* WaDisableThreadStallDopClockGating:bxt */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
			  STALL_DOP_GATING_DISABLE);

	/* WaDisablePooledEuLoadBalancingFix:bxt */
	if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER)) {
		WA_SET_BIT_MASKED(FF_SLICE_CS_CHICKEN2,
				  GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
	}

	/* WaDisableSbeCacheDispatchPortSharing:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0)) {
		WA_SET_BIT_MASKED(
			GEN7_HALF_SLICE_CHICKEN1,
			GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
	}

	/* WaDisableObjectLevelPreemptionForTrifanOrPolygon:bxt */
	/* WaDisableObjectLevelPreemptionForInstancedDraw:bxt */
	/* WaDisableObjectLevelPreemtionForInstanceId:bxt */
	/* WaDisableLSQCROPERFforOCL:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
		ret = wa_ring_whitelist_reg(engine, GEN9_CS_DEBUG_MODE1);
		if (ret)
			return ret;

		ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
		if (ret)
			return ret;
	}

	/* WaProgramL3SqcReg1DefaultForPerf:bxt */
	if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER))
		I915_WRITE(GEN8_L3SQCREG1, L3_GENERAL_PRIO_CREDITS(62) |
					   L3_HIGH_PRIO_CREDITS(2));

	/* WaInsertDummyPushConstPs:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

	return 0;
}

static int kbl_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen9_init_workarounds(engine);
	if (ret)
		return ret;

	/* WaEnableGapsTsvCreditFix:kbl */
	I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
				   GEN9_GAPS_TSV_CREDIT_DISABLE));

	/* WaDisableDynamicCreditSharing:kbl */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
		WA_SET_BIT(GAMT_CHKN_BIT_REG,
			   GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);

	/* WaDisableFenceDestinationToSLM:kbl (pre-prod) */
	if (IS_KBL_REVID(dev_priv, KBL_REVID_A0, KBL_REVID_A0))
		WA_SET_BIT_MASKED(HDC_CHICKEN0,
				  HDC_FENCE_DEST_SLM_DISABLE);

	/* GEN8_L3SQCREG4 has a dependency with WA batch so any new changes
	 * involving this register should also be added to WA batch as required.
	 */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_E0))
		/* WaDisableLSQCROPERFforOCL:kbl */
		I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
			   GEN8_LQSC_RO_PERF_DIS);

	/* WaInsertDummyPushConstPs:kbl */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

	/* WaDisableGafsUnitClkGating:kbl */
	WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaDisableSbeCacheDispatchPortSharing:kbl */
	WA_SET_BIT_MASKED(
		GEN7_HALF_SLICE_CHICKEN1,
		GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);

	/* WaDisableLSQCROPERFforOCL:kbl */
	ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
	if (ret)
		return ret;

	return 0;
}

int init_workarounds_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	WARN_ON(engine->id != RCS);

	dev_priv->workarounds.count = 0;
	dev_priv->workarounds.hw_whitelist_count[RCS] = 0;

	if (IS_BROADWELL(dev_priv))
		return bdw_init_workarounds(engine);

	if (IS_CHERRYVIEW(dev_priv))
		return chv_init_workarounds(engine);

	if (IS_SKYLAKE(dev_priv))
		return skl_init_workarounds(engine);

	if (IS_BROXTON(dev_priv))
		return bxt_init_workarounds(engine);

	if (IS_KABYLAKE(dev_priv))
		return kbl_init_workarounds(engine);

	return 0;
}

static int init_render_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret = init_ring_common(engine);
	if (ret)
		return ret;

	/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
	if (IS_GEN(dev_priv, 4, 6))
		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));

	/* We need to disable the AsyncFlip performance optimisations in order
	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
	 * programmed to '1' on all products.
	 *
	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
	 */
	if (IS_GEN(dev_priv, 6, 7))
		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));

	/* Required for the hardware to program scanline values for waiting */
	/* WaEnableFlushTlbInvalidationMode:snb */
	if (IS_GEN6(dev_priv))
		I915_WRITE(GFX_MODE,
			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));

	/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
	if (IS_GEN7(dev_priv))
		I915_WRITE(GFX_MODE_GEN7,
			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
			   _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));

	if (IS_GEN6(dev_priv)) {
		/* From the Sandybridge PRM, volume 1 part 3, page 24:
		 * "If this bit is set, STCunit will have LRA as replacement
		 *  policy. [...] This bit must be reset.  LRA replacement
		 *  policy is not supported."
		 */
		I915_WRITE(CACHE_MODE_0,
			   _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
	}

	if (IS_GEN(dev_priv, 6, 7))
		I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));

	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);

	return init_workarounds_ring(engine);
}

static void render_ring_cleanup(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (dev_priv->semaphore_obj) {
		i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj);
		drm_gem_object_unreference(&dev_priv->semaphore_obj->base);
		dev_priv->semaphore_obj = NULL;
	}

	intel_fini_pipe_control(engine);
}

static int gen8_rcs_signal(struct drm_i915_gem_request *signaller_req,
			   unsigned int num_dwords)
{
#define MBOX_UPDATE_DWORDS 8
	struct intel_engine_cs *signaller = signaller_req->engine;
	struct drm_i915_private *dev_priv = signaller_req->i915;
	struct intel_engine_cs *waiter;
	enum intel_engine_id id;
	int ret, num_rings;

	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
	num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
#undef MBOX_UPDATE_DWORDS

	ret = intel_ring_begin(signaller_req, num_dwords);
	if (ret)
		return ret;

	for_each_engine_id(waiter, dev_priv, id) {
		u64 gtt_offset = signaller->semaphore.signal_ggtt[id];
		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
			continue;

		intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
		intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
					   PIPE_CONTROL_QW_WRITE |
					   PIPE_CONTROL_CS_STALL);
		intel_ring_emit(signaller, lower_32_bits(gtt_offset));
		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
		intel_ring_emit(signaller, signaller_req->seqno);
		intel_ring_emit(signaller, 0);
		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
					   MI_SEMAPHORE_TARGET(waiter->hw_id));
		intel_ring_emit(signaller, 0);
	}

	return 0;
}

static int gen8_xcs_signal(struct drm_i915_gem_request *signaller_req,
			   unsigned int num_dwords)
{
#define MBOX_UPDATE_DWORDS 6
	struct intel_engine_cs *signaller = signaller_req->engine;
	struct drm_i915_private *dev_priv = signaller_req->i915;
	struct intel_engine_cs *waiter;
	enum intel_engine_id id;
	int ret, num_rings;

	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
	num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
#undef MBOX_UPDATE_DWORDS

	ret = intel_ring_begin(signaller_req, num_dwords);
	if (ret)
		return ret;

	for_each_engine_id(waiter, dev_priv, id) {
		u64 gtt_offset = signaller->semaphore.signal_ggtt[id];
		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
			continue;

		intel_ring_emit(signaller, (MI_FLUSH_DW + 1) |
					   MI_FLUSH_DW_OP_STOREDW);
		intel_ring_emit(signaller, lower_32_bits(gtt_offset) |
					   MI_FLUSH_DW_USE_GTT);
		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
		intel_ring_emit(signaller, signaller_req->seqno);
		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
					   MI_SEMAPHORE_TARGET(waiter->hw_id));
		intel_ring_emit(signaller, 0);
	}

	return 0;
}

static int gen6_signal(struct drm_i915_gem_request *signaller_req,
		       unsigned int num_dwords)
{
	struct intel_engine_cs *signaller = signaller_req->engine;
	struct drm_i915_private *dev_priv = signaller_req->i915;
	struct intel_engine_cs *useless;
	enum intel_engine_id id;
	int ret, num_rings;

#define MBOX_UPDATE_DWORDS 3
	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
	num_dwords += round_up((num_rings-1) * MBOX_UPDATE_DWORDS, 2);
#undef MBOX_UPDATE_DWORDS

	ret = intel_ring_begin(signaller_req, num_dwords);
	if (ret)
		return ret;

	for_each_engine_id(useless, dev_priv, id) {
		i915_reg_t mbox_reg = signaller->semaphore.mbox.signal[id];

		if (i915_mmio_reg_valid(mbox_reg)) {
			intel_ring_emit(signaller, MI_LOAD_REGISTER_IMM(1));
			intel_ring_emit_reg(signaller, mbox_reg);
			intel_ring_emit(signaller, signaller_req->seqno);
		}
	}

	/* If num_dwords was rounded, make sure the tail pointer is correct */
	if (num_rings % 2 == 0)
		intel_ring_emit(signaller, MI_NOOP);

	return 0;
}
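
/*
 * Note on the accounting above: each mailbox update costs three dwords
 * (MI_LOAD_REGISTER_IMM(1), the mailbox register, the seqno), so the total
 * is rounded up to an even number of dwords and padded with an MI_NOOP,
 * presumably to keep the command stream qword aligned.
 */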

/**
 * gen6_add_request - Update the semaphore mailbox registers
 *
 * @req: request to write to the ring
 *
 * Update the mailbox registers in the *other* rings with the current seqno.
 * This acts like a signal in the canonical semaphore.
 */
static int
gen6_add_request(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	if (engine->semaphore.signal)
		ret = engine->semaphore.signal(req, 4);
	else
		ret = intel_ring_begin(req, 4);

	if (ret)
		return ret;

	intel_ring_emit(engine, MI_STORE_DWORD_INDEX);
	intel_ring_emit(engine,
			I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
	intel_ring_emit(engine, req->seqno);
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	__intel_ring_advance(engine);

	return 0;
}
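
/*
 * Note: MI_STORE_DWORD_INDEX above writes the request's seqno into the
 * I915_GEM_HWS_INDEX slot of the hardware status page and MI_USER_INTERRUPT
 * then wakes the CPU, which is how waiters observe request completion.
 */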

static int
gen8_render_add_request(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	if (engine->semaphore.signal)
		ret = engine->semaphore.signal(req, 8);
	else
		ret = intel_ring_begin(req, 8);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(6));
	intel_ring_emit(engine, (PIPE_CONTROL_GLOBAL_GTT_IVB |
				 PIPE_CONTROL_CS_STALL |
				 PIPE_CONTROL_QW_WRITE));
	intel_ring_emit(engine, intel_hws_seqno_address(req->engine));
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, i915_gem_request_get_seqno(req));
	/* We're thrashing one dword of HWS. */
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	intel_ring_emit(engine, MI_NOOP);
	__intel_ring_advance(engine);

	return 0;
}

static inline bool i915_gem_has_seqno_wrapped(struct drm_i915_private *dev_priv,
					      u32 seqno)
{
	return dev_priv->last_seqno < seqno;
}

/**
 * intel_ring_sync - sync the waiter to the signaller on seqno
 *
 * @waiter - ring that is waiting
 * @signaller - ring which has, or will signal
 * @seqno - seqno which the waiter will block on
 */

static int
gen8_ring_sync(struct drm_i915_gem_request *waiter_req,
	       struct intel_engine_cs *signaller,
	       u32 seqno)
{
	struct intel_engine_cs *waiter = waiter_req->engine;
	struct drm_i915_private *dev_priv = waiter_req->i915;
	u64 offset = GEN8_WAIT_OFFSET(waiter, signaller->id);
	struct i915_hw_ppgtt *ppgtt;
	int ret;

	ret = intel_ring_begin(waiter_req, 4);
	if (ret)
		return ret;

	intel_ring_emit(waiter, MI_SEMAPHORE_WAIT |
				MI_SEMAPHORE_GLOBAL_GTT |
				MI_SEMAPHORE_SAD_GTE_SDD);
	intel_ring_emit(waiter, seqno);
	intel_ring_emit(waiter, lower_32_bits(offset));
	intel_ring_emit(waiter, upper_32_bits(offset));
	intel_ring_advance(waiter);

	/* When the !RCS engines idle waiting upon a semaphore, they lose their
	 * pagetables and we must reload them before executing the batch.
	 * We do this on the i915_switch_context() following the wait and
	 * before the dispatch.
	 */
	ppgtt = waiter_req->ctx->ppgtt;
	if (ppgtt && waiter_req->engine->id != RCS)
		ppgtt->pd_dirty_rings |= intel_engine_flag(waiter_req->engine);
	return 0;
}

static int
gen6_ring_sync(struct drm_i915_gem_request *waiter_req,
	       struct intel_engine_cs *signaller,
	       u32 seqno)
{
	struct intel_engine_cs *waiter = waiter_req->engine;
	u32 dw1 = MI_SEMAPHORE_MBOX |
		  MI_SEMAPHORE_COMPARE |
		  MI_SEMAPHORE_REGISTER;
	u32 wait_mbox = signaller->semaphore.mbox.wait[waiter->id];
	int ret;

	/* Throughout all of the GEM code, seqno passed implies our current
	 * seqno is >= the last seqno executed. However for hardware the
	 * comparison is strictly greater than.
	 */
	seqno -= 1;

	WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);

	ret = intel_ring_begin(waiter_req, 4);
	if (ret)
		return ret;

	/* If seqno wrap happened, omit the wait with no-ops */
	if (likely(!i915_gem_has_seqno_wrapped(waiter_req->i915, seqno))) {
		intel_ring_emit(waiter, dw1 | wait_mbox);
		intel_ring_emit(waiter, seqno);
		intel_ring_emit(waiter, 0);
		intel_ring_emit(waiter, MI_NOOP);
	} else {
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
	}
	intel_ring_advance(waiter);

	return 0;
}

static void
gen5_seqno_barrier(struct intel_engine_cs *ring)
{
	/* MI_STORE are internally buffered by the GPU and not flushed
	 * either by MI_FLUSH or SyncFlush or any other combination of
	 * MI commands.
	 *
	 * "Only the submission of the store operation is guaranteed.
	 * The write result will be complete (coherent) some time later
	 * (this is practically a finite period but there is no guaranteed
	 * latency)."
	 *
	 * Empirically, we observe that we need a delay of at least 75us to
	 * be sure that the seqno write is visible by the CPU.
	 */
	usleep_range(125, 250);
}

static void
gen6_seqno_barrier(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	/* Workaround to force correct ordering between irq and seqno writes on
	 * ivb (and maybe also on snb) by reading from a CS register (like
	 * ACTHD) before reading the status page.
	 *
	 * Note that this effectively stalls the read by the time it takes to
	 * do a memory transaction, which more or less ensures that the write
	 * from the GPU has sufficient time to invalidate the CPU cacheline.
	 * Alternatively we could delay the interrupt from the CS ring to give
	 * the write time to land, but that would incur a delay after every
	 * batch i.e. much more frequent than a delay when waiting for the
	 * interrupt (with the same net latency).
	 *
	 * Also note that to prevent whole machine hangs on gen7, we have to
	 * take the spinlock to guard against concurrent cacheline access.
	 */
	spin_lock_irq(&dev_priv->uncore.lock);
	POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
	spin_unlock_irq(&dev_priv->uncore.lock);
}

static void
gen5_irq_enable(struct intel_engine_cs *engine)
{
	gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
}

static void
gen5_irq_disable(struct intel_engine_cs *engine)
{
	gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
}

static void
i9xx_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	dev_priv->irq_mask &= ~engine->irq_enable_mask;
	I915_WRITE(IMR, dev_priv->irq_mask);
	POSTING_READ_FW(RING_IMR(engine->mmio_base));
}

static void
i9xx_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	dev_priv->irq_mask |= engine->irq_enable_mask;
	I915_WRITE(IMR, dev_priv->irq_mask);
}

static void
i8xx_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	dev_priv->irq_mask &= ~engine->irq_enable_mask;
	I915_WRITE16(IMR, dev_priv->irq_mask);
	POSTING_READ16(RING_IMR(engine->mmio_base));
}

static void
i8xx_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	dev_priv->irq_mask |= engine->irq_enable_mask;
	I915_WRITE16(IMR, dev_priv->irq_mask);
}

static int
bsd_ring_flush(struct drm_i915_gem_request *req,
	       u32     invalidate_domains,
	       u32     flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, MI_FLUSH);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);
	return 0;
}

static int
i9xx_add_request(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
1702 1703
	int ret;

1704
	ret = intel_ring_begin(req, 4);
1705 1706
	if (ret)
		return ret;
1707

1708 1709 1710
	intel_ring_emit(engine, MI_STORE_DWORD_INDEX);
	intel_ring_emit(engine,
			I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
1711
	intel_ring_emit(engine, req->seqno);
1712 1713
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	__intel_ring_advance(engine);
1714

1715
	return 0;
1716 1717
}

static void
gen6_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine,
		       ~(engine->irq_enable_mask |
			 engine->irq_keep_mask));
	gen5_enable_gt_irq(dev_priv, engine->irq_enable_mask);
}

static void
gen6_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
	gen5_disable_gt_irq(dev_priv, engine->irq_enable_mask);
}

static void
hsw_vebox_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
	gen6_enable_pm_irq(dev_priv, engine->irq_enable_mask);
}

static void
hsw_vebox_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~0);
	gen6_disable_pm_irq(dev_priv, engine->irq_enable_mask);
}

static void
gen8_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine,
		       ~(engine->irq_enable_mask |
			 engine->irq_keep_mask));
	POSTING_READ_FW(RING_IMR(engine->mmio_base));
}

static void
gen8_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
}

1775
static int
1776
i965_dispatch_execbuffer(struct drm_i915_gem_request *req,
			 u64 offset, u32 length,
1778
			 unsigned dispatch_flags)
1779
{
1780
	struct intel_engine_cs *engine = req->engine;
1781
	int ret;
1782

1783
	ret = intel_ring_begin(req, 2);
1784 1785 1786
	if (ret)
		return ret;

1787
	intel_ring_emit(engine,
1788 1789
			MI_BATCH_BUFFER_START |
			MI_BATCH_GTT |
1790 1791
			(dispatch_flags & I915_DISPATCH_SECURE ?
			 0 : MI_BATCH_NON_SECURE_I965));
1792 1793
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);
1794

1795 1796 1797
	return 0;
}

1798 1799
/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
#define I830_BATCH_LIMIT (256*1024)
1800 1801
#define I830_TLB_ENTRIES (2)
#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
1802
static int
1803
i830_dispatch_execbuffer(struct drm_i915_gem_request *req,
1804 1805
			 u64 offset, u32 len,
			 unsigned dispatch_flags)
1806
{
1807
	struct intel_engine_cs *engine = req->engine;
1808
	u32 cs_offset = engine->scratch.gtt_offset;
1809
	int ret;
1810

1811
	ret = intel_ring_begin(req, 6);
1812 1813
	if (ret)
		return ret;
1814

1815
	/* Evict the invalid PTE TLBs */
	intel_ring_emit(engine, COLOR_BLT_CMD | BLT_WRITE_RGBA);
	intel_ring_emit(engine, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096);
	intel_ring_emit(engine, I830_TLB_ENTRIES << 16 | 4); /* load each page */
	intel_ring_emit(engine, cs_offset);
	intel_ring_emit(engine, 0xdeadbeef);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);
1823

1824
	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1825 1826 1827
		if (len > I830_BATCH_LIMIT)
			return -ENOSPC;

1828
		ret = intel_ring_begin(req, 6 + 2);
1829 1830
		if (ret)
			return ret;
1831 1832 1833 1834 1835

		/* Blit the batch (which has now all relocs applied) to the
		 * stable batch scratch bo area (so that the CS never
		 * stumbles over its tlb invalidation bug) ...
		 */
		intel_ring_emit(engine, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA);
		intel_ring_emit(engine,
				BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096);
		intel_ring_emit(engine, DIV_ROUND_UP(len, 4096) << 16 | 4096);
		intel_ring_emit(engine, cs_offset);
		intel_ring_emit(engine, 4096);
		intel_ring_emit(engine, offset);

		intel_ring_emit(engine, MI_FLUSH);
		intel_ring_emit(engine, MI_NOOP);
		intel_ring_advance(engine);
1847 1848

		/* ... and execute it. */
1849
		offset = cs_offset;
1850
	}
1851

1852
	ret = intel_ring_begin(req, 2);
1853 1854 1855
	if (ret)
		return ret;

1856 1857 1858 1859
	intel_ring_emit(engine, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
	intel_ring_emit(engine, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
					  0 : MI_BATCH_NON_SECURE));
	intel_ring_advance(engine);
1860

1861 1862 1863 1864
	return 0;
}

static int
1865
i915_dispatch_execbuffer(struct drm_i915_gem_request *req,
			 u64 offset, u32 len,
1867
			 unsigned dispatch_flags)
1868
{
1869
	struct intel_engine_cs *engine = req->engine;
1870 1871
	int ret;

1872
	ret = intel_ring_begin(req, 2);
1873 1874 1875
	if (ret)
		return ret;

1876 1877 1878 1879
	intel_ring_emit(engine, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
	intel_ring_emit(engine, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
					  0 : MI_BATCH_NON_SECURE));
	intel_ring_advance(engine);
1880 1881 1882 1883

	return 0;
}

1884
static void cleanup_phys_status_page(struct intel_engine_cs *engine)
1885
{
1886
	struct drm_i915_private *dev_priv = engine->i915;
1887 1888 1889 1890

	if (!dev_priv->status_page_dmah)
		return;

1891
	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
1892
	engine->status_page.page_addr = NULL;
1893 1894
}

1895
static void cleanup_status_page(struct intel_engine_cs *engine)
1896
{
1897
	struct drm_i915_gem_object *obj;
1898

1899
	obj = engine->status_page.obj;
1900
	if (obj == NULL)
1901 1902
		return;

1903
	kunmap(sg_page(obj->pages->sgl));
	i915_gem_object_ggtt_unpin(obj);
1905
	drm_gem_object_unreference(&obj->base);
1906
	engine->status_page.obj = NULL;
1907 1908
}

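/* Allocate and pin the engine's hardware status page (HWS). The CS writes
 * seqnos and other status dwords into this page, so it must stay pinned in
 * the GGTT for the lifetime of the engine; on !llc platforms it is kept in
 * the low mappable arena (see the comment on the g33 erratum below).
 */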
static int init_status_page(struct intel_engine_cs *engine)
1910
{
1911
	struct drm_i915_gem_object *obj = engine->status_page.obj;
1912

1913
	if (obj == NULL) {
1914
		unsigned flags;
1915
		int ret;
1916

1917
		obj = i915_gem_object_create(&engine->i915->drm, 4096);
1918
		if (IS_ERR(obj)) {
1919
			DRM_ERROR("Failed to allocate status page\n");
1920
			return PTR_ERR(obj);
1921
		}
1922

1923 1924 1925 1926
		ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
		if (ret)
			goto err_unref;

1927
		flags = 0;
1928
		if (!HAS_LLC(engine->i915))
			/* On g33, we cannot place HWS above 256MiB, so
			 * restrict its pinning to the low mappable arena.
			 * Though this restriction is not documented for
			 * gen4, gen5, or byt, they also behave similarly
			 * and hang if the HWS is placed at the top of the
			 * GTT. To generalise, it appears that all !llc
			 * platforms have issues with us placing the HWS
			 * above the mappable region (even though we never
			 * actually map it).
			 */
			flags |= PIN_MAPPABLE;
		ret = i915_gem_obj_ggtt_pin(obj, 4096, flags);
		if (ret) {
err_unref:
			drm_gem_object_unreference(&obj->base);
			return ret;
		}

1947
		engine->status_page.obj = obj;
1948
	}
1949

1950 1951 1952
	engine->status_page.gfx_addr = i915_gem_obj_ggtt_offset(obj);
	engine->status_page.page_addr = kmap(sg_page(obj->pages->sgl));
	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
1953

1954
	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
1955
			engine->name, engine->status_page.gfx_addr);
1956 1957 1958 1959

	return 0;
}

1960
static int init_phys_status_page(struct intel_engine_cs *engine)
1961
{
1962
	struct drm_i915_private *dev_priv = engine->i915;
1963 1964 1965

	if (!dev_priv->status_page_dmah) {
		dev_priv->status_page_dmah =
1966
			drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
1967 1968 1969 1970
		if (!dev_priv->status_page_dmah)
			return -ENOMEM;
	}

1971 1972
	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
1973 1974 1975 1976

	return 0;
}

1977
void intel_unpin_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
1978
{
1979 1980 1981
	GEM_BUG_ON(ringbuf->vma == NULL);
	GEM_BUG_ON(ringbuf->virtual_start == NULL);

1982
	if (HAS_LLC(ringbuf->obj->base.dev) && !ringbuf->obj->stolen)
1983
		i915_gem_object_unpin_map(ringbuf->obj);
1984
	else
1985
		i915_vma_unpin_iomap(ringbuf->vma);
1986
	ringbuf->virtual_start = NULL;
1987

1988
	i915_gem_object_ggtt_unpin(ringbuf->obj);
1989
	ringbuf->vma = NULL;
1990 1991
}

1992
int intel_pin_and_map_ringbuffer_obj(struct drm_i915_private *dev_priv,
1993 1994 1995
				     struct intel_ringbuffer *ringbuf)
{
	struct drm_i915_gem_object *obj = ringbuf->obj;
1996 1997
	/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
	unsigned flags = PIN_OFFSET_BIAS | 4096;
1998
	void *addr;
1999 2000
	int ret;

2001
	if (HAS_LLC(dev_priv) && !obj->stolen) {
2002
		ret = i915_gem_obj_ggtt_pin(obj, PAGE_SIZE, flags);
2003 2004
		if (ret)
			return ret;
2005

2006
		ret = i915_gem_object_set_to_cpu_domain(obj, true);
2007 2008
		if (ret)
			goto err_unpin;
2009

2010 2011 2012
		addr = i915_gem_object_pin_map(obj);
		if (IS_ERR(addr)) {
			ret = PTR_ERR(addr);
2013
			goto err_unpin;
2014 2015
		}
	} else {
2016 2017
		ret = i915_gem_obj_ggtt_pin(obj, PAGE_SIZE,
					    flags | PIN_MAPPABLE);
2018 2019
		if (ret)
			return ret;
2020

2021
		ret = i915_gem_object_set_to_gtt_domain(obj, true);
2022 2023
		if (ret)
			goto err_unpin;
2024

2025 2026 2027
		/* Access through the GTT requires the device to be awake. */
		assert_rpm_wakelock_held(dev_priv);

2028 2029 2030
		addr = i915_vma_pin_iomap(i915_gem_obj_to_ggtt(obj));
		if (IS_ERR(addr)) {
			ret = PTR_ERR(addr);
2031
			goto err_unpin;
2032
		}
2033 2034
	}

2035
	ringbuf->virtual_start = addr;
2036
	ringbuf->vma = i915_gem_obj_to_ggtt(obj);
2037
	return 0;
2038 2039 2040 2041

err_unpin:
	i915_gem_object_ggtt_unpin(obj);
	return ret;
2042 2043
}

2044
static void intel_destroy_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
2045
{
2046 2047 2048 2049
	drm_gem_object_unreference(&ringbuf->obj->base);
	ringbuf->obj = NULL;
}

2050 2051
static int intel_alloc_ringbuffer_obj(struct drm_device *dev,
				      struct intel_ringbuffer *ringbuf)
2052
{
2053
	struct drm_i915_gem_object *obj;
2054

2055 2056
	obj = NULL;
	if (!HAS_LLC(dev))
2057
		obj = i915_gem_object_create_stolen(dev, ringbuf->size);
2058
	if (obj == NULL)
2059
		obj = i915_gem_object_create(dev, ringbuf->size);
2060 2061
	if (IS_ERR(obj))
		return PTR_ERR(obj);
2062

2063 2064 2065
	/* mark ring buffers as read-only from GPU side by default */
	obj->gt_ro = 1;

2066
	ringbuf->obj = obj;
2067

2068
	return 0;
2069 2070
}

struct intel_ringbuffer *
intel_engine_create_ringbuffer(struct intel_engine_cs *engine, int size)
{
	struct intel_ringbuffer *ring;
	int ret;

	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
2078 2079 2080
	if (ring == NULL) {
		DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s\n",
				 engine->name);
2081
		return ERR_PTR(-ENOMEM);
2082
	}
2083

2084
	ring->engine = engine;
2085
	list_add(&ring->link, &engine->buffers);
2086 2087 2088 2089 2090 2091 2092

	ring->size = size;
	/* Workaround an erratum on the i830 which causes a hang if
	 * the TAIL pointer points to within the last 2 cachelines
	 * of the buffer.
	 */
	ring->effective_size = size;
2093
	if (IS_I830(engine->i915) || IS_845G(engine->i915))
2094 2095 2096 2097 2098
		ring->effective_size -= 2 * CACHELINE_BYTES;

	ring->last_retired_head = -1;
	intel_ring_update_space(ring);

2099
	ret = intel_alloc_ringbuffer_obj(&engine->i915->drm, ring);
2100
	if (ret) {
2101 2102 2103
		DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s: %d\n",
				 engine->name, ret);
		list_del(&ring->link);
		kfree(ring);
		return ERR_PTR(ret);
	}

	return ring;
}

void
intel_ringbuffer_free(struct intel_ringbuffer *ring)
{
	intel_destroy_ringbuffer_obj(ring);
2115
	list_del(&ring->link);
2116 2117 2118
	kfree(ring);
}

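/* Legacy (ring submission) context pinning: the first pin of a context on
 * an engine pins its state object into the GGTT, later pins simply bump
 * ce->pin_count. Callers must hold struct_mutex and must balance each pin
 * with intel_ring_context_unpin().
 */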
static int intel_ring_context_pin(struct i915_gem_context *ctx,
				  struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];
	int ret;

2125
	lockdep_assert_held(&ctx->i915->drm.struct_mutex);

	if (ce->pin_count++)
		return 0;

	if (ce->state) {
		ret = i915_gem_obj_ggtt_pin(ce->state, ctx->ggtt_alignment, 0);
		if (ret)
			goto error;
	}

	/* The kernel context is only used as a placeholder for flushing the
	 * active context. It is never used for submitting user rendering and
	 * as such never requires the golden render context, and so we can skip
	 * emitting it when we switch to the kernel context. This is required
	 * as during eviction we cannot allocate and pin the renderstate in
	 * order to initialise the context.
	 */
	if (ctx == ctx->i915->kernel_context)
		ce->initialised = true;

	i915_gem_context_reference(ctx);
	return 0;

error:
	ce->pin_count = 0;
	return ret;
}

static void intel_ring_context_unpin(struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];

2159
	lockdep_assert_held(&ctx->i915->drm.struct_mutex);

	if (--ce->pin_count)
		return;

	if (ce->state)
		i915_gem_object_ggtt_unpin(ce->state);

	i915_gem_context_unreference(ctx);
}

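/* Common initialisation for a legacy ring engine: set up the engine lists
 * and breadcrumbs, pin the kernel context, then allocate, pin and map the
 * ringbuffer and status page. Any failure unwinds via intel_cleanup_engine().
 */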
static int intel_init_ring_buffer(struct drm_device *dev,
2171
				  struct intel_engine_cs *engine)
2172
{
2173
	struct drm_i915_private *dev_priv = to_i915(dev);
2174
	struct intel_ringbuffer *ringbuf;
2175 2176
	int ret;

2177
	WARN_ON(engine->buffer);
2178

2179
	engine->i915 = dev_priv;
2180 2181 2182 2183 2184 2185 2186
	INIT_LIST_HEAD(&engine->active_list);
	INIT_LIST_HEAD(&engine->request_list);
	INIT_LIST_HEAD(&engine->execlist_queue);
	INIT_LIST_HEAD(&engine->buffers);
	i915_gem_batch_pool_init(dev, &engine->batch_pool);
	memset(engine->semaphore.sync_seqno, 0,
	       sizeof(engine->semaphore.sync_seqno));
2187

2188 2189 2190
	ret = intel_engine_init_breadcrumbs(engine);
	if (ret)
		goto error;
2191

	/* We may need to do things with the shrinker which
	 * require us to immediately switch back to the default
	 * context. This can cause a problem as pinning the
	 * default context also requires GTT space which may not
	 * be available. To avoid this we always pin the default
	 * context.
	 */
	ret = intel_ring_context_pin(dev_priv->kernel_context, engine);
	if (ret)
		goto error;

2203
	ringbuf = intel_engine_create_ringbuffer(engine, 32 * PAGE_SIZE);
2204 2205 2206 2207
	if (IS_ERR(ringbuf)) {
		ret = PTR_ERR(ringbuf);
		goto error;
	}
2208
	engine->buffer = ringbuf;
2209

2210
	if (I915_NEED_GFX_HWS(dev_priv)) {
2211
		ret = init_status_page(engine);
2212
		if (ret)
2213
			goto error;
2214
	} else {
2215 2216
		WARN_ON(engine->id != RCS);
		ret = init_phys_status_page(engine);
2217
		if (ret)
2218
			goto error;
2219 2220
	}

2221
	ret = intel_pin_and_map_ringbuffer_obj(dev_priv, ringbuf);
2222 2223
	if (ret) {
		DRM_ERROR("Failed to pin and map ringbuffer %s: %d\n",
2224
				engine->name, ret);
2225 2226
		intel_destroy_ringbuffer_obj(ringbuf);
		goto error;
2227
	}
2228

2229
	ret = i915_cmd_parser_init_ring(engine);
2230
	if (ret)
2231 2232 2233
		goto error;

	return 0;
2234

2235
error:
2236
	intel_cleanup_engine(engine);
2237
	return ret;
2238 2239
}

2240
void intel_cleanup_engine(struct intel_engine_cs *engine)
2241
{
2242
	struct drm_i915_private *dev_priv;
2243

2244
	if (!intel_engine_initialized(engine))
2245 2246
		return;

2247
	dev_priv = engine->i915;
2248

2249
	if (engine->buffer) {
2250
		intel_stop_engine(engine);
2251
		WARN_ON(!IS_GEN2(dev_priv) && (I915_READ_MODE(engine) & MODE_IDLE) == 0);
2252

2253 2254 2255
		intel_unpin_ringbuffer_obj(engine->buffer);
		intel_ringbuffer_free(engine->buffer);
		engine->buffer = NULL;
2256
	}
2257

2258 2259
	if (engine->cleanup)
		engine->cleanup(engine);

2261
	if (I915_NEED_GFX_HWS(dev_priv)) {
2262
		cleanup_status_page(engine);
2263
	} else {
2264 2265
		WARN_ON(engine->id != RCS);
		cleanup_phys_status_page(engine);
2266
	}
2267

2268 2269
	i915_cmd_parser_fini_ring(engine);
	i915_gem_batch_pool_fini(&engine->batch_pool);
2270
	intel_engine_fini_breadcrumbs(engine);
2271 2272 2273

	intel_ring_context_unpin(dev_priv->kernel_context, engine);

2274
	engine->i915 = NULL;
2275 2276
}

2277
int intel_engine_idle(struct intel_engine_cs *engine)
2278
{
2279
	struct drm_i915_gem_request *req;
2280 2281

	/* Wait upon the last request to be completed */
2282
	if (list_empty(&engine->request_list))
2283 2284
		return 0;

2285 2286 2287
	req = list_entry(engine->request_list.prev,
			 struct drm_i915_gem_request,
			 list);
2288 2289 2290

	/* Make sure we do not trigger any retires */
	return __i915_wait_request(req,
2291
				   req->i915->mm.interruptible,
2292
				   NULL, NULL);
2293 2294
}

2295
int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
2296
{
	int ret;

	/* Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
2303
	request->reserved_space += LEGACY_REQUEST_SIZE;
2304

2305
	request->ringbuf = request->engine->buffer;
2306 2307 2308 2309 2310

	ret = intel_ring_begin(request, 0);
	if (ret)
		return ret;

2311
	request->reserved_space -= LEGACY_REQUEST_SIZE;
2312
	return 0;
2313 2314
}

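/* Wait for enough ring space to become available: walk the engine's request
 * list for the oldest request whose retirement would free at least @bytes of
 * this ring, and wait on that single request rather than draining the whole
 * ring.
 */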
static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
{
	struct intel_ringbuffer *ringbuf = req->ringbuf;
	struct intel_engine_cs *engine = req->engine;
	struct drm_i915_gem_request *target;

	intel_ring_update_space(ringbuf);
	if (ringbuf->space >= bytes)
		return 0;

	/*
	 * Space is reserved in the ringbuffer for finalising the request,
	 * as that cannot be allowed to fail. During request finalisation,
	 * reserved_space is set to 0 to stop the overallocation and the
	 * assumption is that then we never need to wait (which has the
	 * risk of failing with EINTR).
	 *
	 * See also i915_gem_request_alloc() and i915_add_request().
	 */
2334
	GEM_BUG_ON(!req->reserved_space);
2335 2336 2337 2338

	list_for_each_entry(target, &engine->request_list, list) {
		unsigned space;

2339
		/*
2340 2341 2342
		 * The request queue is per-engine, so can contain requests
		 * from multiple ringbuffers. Here, we must ignore any that
		 * aren't from the ringbuffer we're considering.
2343
		 */
		if (target->ringbuf != ringbuf)
			continue;

		/* Would completion of this request free enough space? */
		space = __intel_ring_space(target->postfix, ringbuf->tail,
					   ringbuf->size);
		if (space >= bytes)
			break;
2352
	}
2353

2354 2355 2356 2357
	if (WARN_ON(&target->list == &engine->request_list))
		return -ENOSPC;

	return i915_wait_request(target);
2358 2359
}

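/* Reserve @num_dwords of ring space, waiting for space and/or wrapping the
 * tail as required. Callers in this file follow the same pattern, e.g.:
 *
 *	ret = intel_ring_begin(req, 2);
 *	if (ret)
 *		return ret;
 *
 *	intel_ring_emit(engine, MI_FLUSH);
 *	intel_ring_emit(engine, MI_NOOP);
 *	intel_ring_advance(engine);
 *
 * i.e. exactly the reserved number of dwords is written with
 * intel_ring_emit() before the tail is advanced.
 */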
int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
{
2362
	struct intel_ringbuffer *ringbuf = req->ringbuf;
2363
	int remain_actual = ringbuf->size - ringbuf->tail;
2364 2365 2366
	int remain_usable = ringbuf->effective_size - ringbuf->tail;
	int bytes = num_dwords * sizeof(u32);
	int total_bytes, wait_bytes;
2367
	bool need_wrap = false;
2368

2369
	total_bytes = bytes + req->reserved_space;
2370

	if (unlikely(bytes > remain_usable)) {
		/*
		 * Not enough space for the basic request. So need to flush
		 * out the remainder and then wait for base + reserved.
		 */
		wait_bytes = remain_actual + total_bytes;
		need_wrap = true;
	} else if (unlikely(total_bytes > remain_usable)) {
		/*
		 * The base request will fit but the reserved space
		 * falls off the end. So we don't need an immediate wrap
		 * and only need to effectively wait for the reserved
		 * size space from the start of ringbuffer.
		 */
2385
		wait_bytes = remain_actual + req->reserved_space;
2386
	} else {
2387 2388
		/* No wrapping required, just waiting. */
		wait_bytes = total_bytes;
	}

2391 2392
	if (wait_bytes > ringbuf->space) {
		int ret = wait_for_space(req, wait_bytes);
		if (unlikely(ret))
			return ret;
2395

2396
		intel_ring_update_space(ringbuf);
2397 2398
		if (unlikely(ringbuf->space < wait_bytes))
			return -EAGAIN;
	}

2401 2402 2403
	if (unlikely(need_wrap)) {
		GEM_BUG_ON(remain_actual > ringbuf->space);
		GEM_BUG_ON(ringbuf->tail + remain_actual > ringbuf->size);
2404

		/* Fill the tail with MI_NOOP */
		memset(ringbuf->virtual_start + ringbuf->tail,
		       0, remain_actual);
		ringbuf->tail = 0;
		ringbuf->space -= remain_actual;
	}
2411

2412 2413
	ringbuf->space -= bytes;
	GEM_BUG_ON(ringbuf->space < 0);
2414
	return 0;
2415
}
2416

2417
/* Align the ring tail to a cacheline boundary */
2418
int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
2419
{
2420
	struct intel_engine_cs *engine = req->engine;
2421
	int num_dwords = (engine->buffer->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
2422 2423 2424 2425 2426
	int ret;

	if (num_dwords == 0)
		return 0;

2427
	num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
2428
	ret = intel_ring_begin(req, num_dwords);
2429 2430 2431 2432
	if (ret)
		return ret;

	while (num_dwords--)
2433
		intel_ring_emit(engine, MI_NOOP);
2434

2435
	intel_ring_advance(engine);
2436 2437 2438 2439

	return 0;
}

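/* Reset the engine's notion of the current seqno, e.g. after a GPU reset or
 * when the global seqno wraps. The semaphore bookkeeping is cleared as well
 * (see the comment below) and any sleeping waiters are kicked so that they
 * re-sample the new value.
 */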
void intel_ring_init_seqno(struct intel_engine_cs *engine, u32 seqno)
2441
{
2442
	struct drm_i915_private *dev_priv = engine->i915;
2443

	/* Our semaphore implementation is strictly monotonic (i.e. we proceed
	 * so long as the semaphore value in the register/page is greater
	 * than the sync value), so whenever we reset the seqno,
	 * so long as we reset the tracking semaphore value to 0, it will
	 * always be before the next request's seqno. If we don't reset
	 * the semaphore value, then when the seqno moves backwards all
	 * future waits will complete instantly (causing rendering corruption).
	 */
2452
	if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) {
2453 2454
		I915_WRITE(RING_SYNC_0(engine->mmio_base), 0);
		I915_WRITE(RING_SYNC_1(engine->mmio_base), 0);
2455
		if (HAS_VEBOX(dev_priv))
2456
			I915_WRITE(RING_SYNC_2(engine->mmio_base), 0);
2457
	}
	if (dev_priv->semaphore_obj) {
		struct drm_i915_gem_object *obj = dev_priv->semaphore_obj;
		struct page *page = i915_gem_object_get_dirty_page(obj, 0);
		void *semaphores = kmap(page);
		memset(semaphores + GEN8_SEMAPHORE_OFFSET(engine->id, 0),
		       0, I915_NUM_ENGINES * gen8_semaphore_seqno_size);
		kunmap(page);
	}
2466 2467
	memset(engine->semaphore.sync_seqno, 0,
	       sizeof(engine->semaphore.sync_seqno));
2468

2469 2470 2471
	intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
	if (engine->irq_seqno_barrier)
		engine->irq_seqno_barrier(engine);
2472
	engine->last_submitted_seqno = seqno;
2473

2474
	engine->hangcheck.seqno = seqno;

	/* After manually advancing the seqno, fake the interrupt in case
	 * there are any waiters for that seqno.
	 */
	rcu_read_lock();
	intel_engine_wakeup(engine);
	rcu_read_unlock();
2482
}
2483

2484
static void gen6_bsd_ring_write_tail(struct intel_engine_cs *engine,
2485
				     u32 value)
2486
{
2487
	struct drm_i915_private *dev_priv = engine->i915;
2488

2489 2490
	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);

2491
	/* Every tail move must follow the sequence below */
2492 2493 2494 2495

	/* Disable notification that the ring is IDLE. The GT
	 * will then assume that it is busy and bring it out of rc6.
	 */
2496 2497
	I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL,
		      _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
2498 2499

	/* Clear the context id. Here be magic! */
2500
	I915_WRITE64_FW(GEN6_BSD_RNCID, 0x0);
2501

2502
	/* Wait for the ring not to be idle, i.e. for it to wake up. */
	if (intel_wait_for_register_fw(dev_priv,
				       GEN6_BSD_SLEEP_PSMI_CONTROL,
				       GEN6_BSD_SLEEP_INDICATOR,
				       0,
				       50))
2508
		DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
2509

2510
	/* Now that the ring is fully powered up, update the tail */
2511 2512
	I915_WRITE_FW(RING_TAIL(engine->mmio_base), value);
	POSTING_READ_FW(RING_TAIL(engine->mmio_base));
2513 2514 2515 2516

	/* Let the ring send IDLE messages to the GT again,
	 * and so let it sleep to conserve power when idle.
	 */
2517 2518 2519 2520
	I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL,
		      _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));

	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
2521 2522
}

2523
static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req,
2524
			       u32 invalidate, u32 flush)
2525
{
2526
	struct intel_engine_cs *engine = req->engine;
2527
	uint32_t cmd;
2528 2529
	int ret;

2530
	ret = intel_ring_begin(req, 4);
2531 2532 2533
	if (ret)
		return ret;

2534
	cmd = MI_FLUSH_DW;
2535
	if (INTEL_GEN(req->i915) >= 8)
		cmd += 1;

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.5 - video engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
2551
	if (invalidate & I915_GEM_GPU_DOMAINS)
2552 2553
		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;

2554 2555 2556
	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine,
			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
2557
	if (INTEL_GEN(req->i915) >= 8) {
2558 2559
		intel_ring_emit(engine, 0); /* upper addr */
		intel_ring_emit(engine, 0); /* value */
	} else  {
2561 2562
		intel_ring_emit(engine, 0);
		intel_ring_emit(engine, MI_NOOP);
	}
2564
	intel_ring_advance(engine);
2565
	return 0;
2566 2567
}

2568
static int
2569
gen8_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
			      u64 offset, u32 len,
2571
			      unsigned dispatch_flags)
2572
{
2573
	struct intel_engine_cs *engine = req->engine;
2574
	bool ppgtt = USES_PPGTT(engine->dev) &&
2575
			!(dispatch_flags & I915_DISPATCH_SECURE);
2576 2577
	int ret;

2578
	ret = intel_ring_begin(req, 4);
2579 2580 2581 2582
	if (ret)
		return ret;

	/* FIXME(BDW): Address space and security selectors. */
2583
	intel_ring_emit(engine, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
2584 2585
			(dispatch_flags & I915_DISPATCH_RS ?
			 MI_BATCH_RESOURCE_STREAMER : 0));
2586 2587 2588 2589
	intel_ring_emit(engine, lower_32_bits(offset));
	intel_ring_emit(engine, upper_32_bits(offset));
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);
2590 2591 2592 2593

	return 0;
}

2594
static int
2595
hsw_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
2596 2597
			     u64 offset, u32 len,
			     unsigned dispatch_flags)
2598
{
2599
	struct intel_engine_cs *engine = req->engine;
2600 2601
	int ret;

2602
	ret = intel_ring_begin(req, 2);
2603 2604 2605
	if (ret)
		return ret;

2606
	intel_ring_emit(engine,
2607
			MI_BATCH_BUFFER_START |
2608
			(dispatch_flags & I915_DISPATCH_SECURE ?
2609 2610 2611
			 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
			(dispatch_flags & I915_DISPATCH_RS ?
			 MI_BATCH_RESOURCE_STREAMER : 0));
2612
	/* bit0-7 is the length on GEN6+ */
2613 2614
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);
2615 2616 2617 2618

	return 0;
}

2619
static int
2620
gen6_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
			      u64 offset, u32 len,
2622
			      unsigned dispatch_flags)
2623
{
2624
	struct intel_engine_cs *engine = req->engine;
2625
	int ret;
2626

2627
	ret = intel_ring_begin(req, 2);
2628 2629
	if (ret)
		return ret;
2630

2631
	intel_ring_emit(engine,
2632
			MI_BATCH_BUFFER_START |
2633 2634
			(dispatch_flags & I915_DISPATCH_SECURE ?
			 0 : MI_BATCH_NON_SECURE_I965));
2635
	/* bit0-7 is the length on GEN6+ */
2636 2637
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);
2638

2639
	return 0;
2640 2641
}

2642 2643
/* Blitter support (SandyBridge+) */

2644
static int gen6_ring_flush(struct drm_i915_gem_request *req,
2645
			   u32 invalidate, u32 flush)
{
2647
	struct intel_engine_cs *engine = req->engine;
2648
	uint32_t cmd;
2649 2650
	int ret;

2651
	ret = intel_ring_begin(req, 4);
2652 2653 2654
	if (ret)
		return ret;

2655
	cmd = MI_FLUSH_DW;
2656
	if (INTEL_GEN(req->i915) >= 8)
		cmd += 1;

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
2672
	if (invalidate & I915_GEM_DOMAIN_RENDER)
2673
		cmd |= MI_INVALIDATE_TLB;
2674 2675 2676
	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine,
			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
2677
	if (INTEL_GEN(req->i915) >= 8) {
2678 2679
		intel_ring_emit(engine, 0); /* upper addr */
		intel_ring_emit(engine, 0); /* value */
	} else  {
2681 2682
		intel_ring_emit(engine, 0);
		intel_ring_emit(engine, MI_NOOP);
	}
2684
	intel_ring_advance(engine);

2686
	return 0;
}

2689 2690 2691
static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv,
				       struct intel_engine_cs *engine)
{
2692
	struct drm_i915_gem_object *obj;
2693
	int ret, i;
2694 2695 2696 2697 2698

	if (!i915_semaphore_is_enabled(dev_priv))
		return;

	if (INTEL_GEN(dev_priv) >= 8 && !dev_priv->semaphore_obj) {
2699
		obj = i915_gem_object_create(&dev_priv->drm, 4096);
		if (IS_ERR(obj)) {
			DRM_ERROR("Failed to allocate semaphore bo. Disabling semaphores\n");
			i915.semaphores = 0;
		} else {
			i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
			ret = i915_gem_obj_ggtt_pin(obj, 0, PIN_NONBLOCK);
			if (ret != 0) {
				drm_gem_object_unreference(&obj->base);
				DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n");
				i915.semaphores = 0;
			} else {
				dev_priv->semaphore_obj = obj;
			}
		}
	}

2716 2717 2718 2719
	if (!i915_semaphore_is_enabled(dev_priv))
		return;

	if (INTEL_GEN(dev_priv) >= 8) {
2720 2721
		u64 offset = i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj);

2722 2723
		engine->semaphore.sync_to = gen8_ring_sync;
		engine->semaphore.signal = gen8_xcs_signal;

		for (i = 0; i < I915_NUM_ENGINES; i++) {
			u64 ring_offset;

			if (i != engine->id)
				ring_offset = offset + GEN8_SEMAPHORE_OFFSET(engine->id, i);
			else
				ring_offset = MI_SEMAPHORE_SYNC_INVALID;

			engine->semaphore.signal_ggtt[i] = ring_offset;
		}
2735 2736 2737
	} else if (INTEL_GEN(dev_priv) >= 6) {
		engine->semaphore.sync_to = gen6_ring_sync;
		engine->semaphore.signal = gen6_signal;

		/*
		 * The current semaphore is only applied on pre-gen8
		 * platform.  And there is no VCS2 ring on the pre-gen8
		 * platform. So the semaphore between RCS and VCS2 is
		 * initialized as INVALID.  Gen8 will initialize the
		 * sema between VCS2 and RCS later.
		 */
		for (i = 0; i < I915_NUM_ENGINES; i++) {
			static const struct {
				u32 wait_mbox;
				i915_reg_t mbox_reg;
			} sem_data[I915_NUM_ENGINES][I915_NUM_ENGINES] = {
				[RCS] = {
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_RV,  .mbox_reg = GEN6_VRSYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_RB,  .mbox_reg = GEN6_BRSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_RVE, .mbox_reg = GEN6_VERSYNC },
				},
				[VCS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VR,  .mbox_reg = GEN6_RVSYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VB,  .mbox_reg = GEN6_BVSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VVE, .mbox_reg = GEN6_VEVSYNC },
				},
				[BCS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_BR,  .mbox_reg = GEN6_RBSYNC },
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_BV,  .mbox_reg = GEN6_VBSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_BVE, .mbox_reg = GEN6_VEBSYNC },
				},
				[VECS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VER, .mbox_reg = GEN6_RVESYNC },
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VEV, .mbox_reg = GEN6_VVESYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VEB, .mbox_reg = GEN6_BVESYNC },
				},
			};
			u32 wait_mbox;
			i915_reg_t mbox_reg;

			if (i == engine->id || i == VCS2) {
				wait_mbox = MI_SEMAPHORE_SYNC_INVALID;
				mbox_reg = GEN6_NOSYNC;
			} else {
				wait_mbox = sem_data[engine->id][i].wait_mbox;
				mbox_reg = sem_data[engine->id][i].mbox_reg;
			}

			engine->semaphore.mbox.wait[i] = wait_mbox;
			engine->semaphore.mbox.signal[i] = mbox_reg;
		}
2786 2787 2788
	}
}

2789 2790 2791 2792
static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
				struct intel_engine_cs *engine)
{
	if (INTEL_GEN(dev_priv) >= 8) {
2793 2794
		engine->irq_enable = gen8_irq_enable;
		engine->irq_disable = gen8_irq_disable;
2795 2796
		engine->irq_seqno_barrier = gen6_seqno_barrier;
	} else if (INTEL_GEN(dev_priv) >= 6) {
2797 2798
		engine->irq_enable = gen6_irq_enable;
		engine->irq_disable = gen6_irq_disable;
2799 2800
		engine->irq_seqno_barrier = gen6_seqno_barrier;
	} else if (INTEL_GEN(dev_priv) >= 5) {
2801 2802
		engine->irq_enable = gen5_irq_enable;
		engine->irq_disable = gen5_irq_disable;
2803
		engine->irq_seqno_barrier = gen5_seqno_barrier;
2804
	} else if (INTEL_GEN(dev_priv) >= 3) {
2805 2806
		engine->irq_enable = i9xx_irq_enable;
		engine->irq_disable = i9xx_irq_disable;
2807
	} else {
2808 2809
		engine->irq_enable = i8xx_irq_enable;
		engine->irq_disable = i8xx_irq_disable;
2810 2811 2812
	}
}

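/* Fill in the vfuncs common to all legacy ring engines; the per-engine init
 * functions below override these wherever a generation or engine needs
 * something different.
 */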
static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
				      struct intel_engine_cs *engine)
{
2816
	engine->init_hw = init_ring_common;
2817
	engine->write_tail = ring_write_tail;
2818

2819 2820
	engine->add_request = i9xx_add_request;
	if (INTEL_GEN(dev_priv) >= 6)
2821
		engine->add_request = gen6_add_request;
2822 2823 2824 2825

	if (INTEL_GEN(dev_priv) >= 8)
		engine->dispatch_execbuffer = gen8_ring_dispatch_execbuffer;
	else if (INTEL_GEN(dev_priv) >= 6)
2826
		engine->dispatch_execbuffer = gen6_ring_dispatch_execbuffer;
2827
	else if (INTEL_GEN(dev_priv) >= 4)
2828
		engine->dispatch_execbuffer = i965_dispatch_execbuffer;
2829 2830 2831 2832
	else if (IS_I830(dev_priv) || IS_845G(dev_priv))
		engine->dispatch_execbuffer = i830_dispatch_execbuffer;
	else
		engine->dispatch_execbuffer = i915_dispatch_execbuffer;
2833

2834
	intel_ring_init_irq(dev_priv, engine);
2835
	intel_ring_init_semaphores(dev_priv, engine);
2836 2837
}

2838 2839
int intel_init_render_ring_buffer(struct drm_device *dev)
{
2840
	struct drm_i915_private *dev_priv = to_i915(dev);
2841
	struct intel_engine_cs *engine = &dev_priv->engine[RCS];
2842
	int ret;
2843

2844 2845 2846
	engine->name = "render ring";
	engine->id = RCS;
	engine->exec_id = I915_EXEC_RENDER;
2847
	engine->hw_id = 0;
2848
	engine->mmio_base = RENDER_RING_BASE;
2849

2850 2851
	intel_ring_default_vfuncs(dev_priv, engine);

2852
	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
2853 2854
	if (HAS_L3_DPF(dev_priv))
		engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2855

2856
	if (INTEL_GEN(dev_priv) >= 8) {
2857
		engine->init_context = intel_rcs_ctx_init;
2858
		engine->add_request = gen8_render_add_request;
2859
		engine->flush = gen8_render_ring_flush;
2860
		if (i915_semaphore_is_enabled(dev_priv))
2861
			engine->semaphore.signal = gen8_rcs_signal;
2862
	} else if (INTEL_GEN(dev_priv) >= 6) {
2863 2864
		engine->init_context = intel_rcs_ctx_init;
		engine->flush = gen7_render_ring_flush;
2865
		if (IS_GEN6(dev_priv))
2866
			engine->flush = gen6_render_ring_flush;
2867
	} else if (IS_GEN5(dev_priv)) {
2868
		engine->flush = gen4_render_ring_flush;
2869
	} else {
2870
		if (INTEL_GEN(dev_priv) < 4)
2871
			engine->flush = gen2_render_ring_flush;
2872
		else
2873 2874
			engine->flush = gen4_render_ring_flush;
		engine->irq_enable_mask = I915_USER_INTERRUPT;
2875
	}

2877
	if (IS_HASWELL(dev_priv))
2878
		engine->dispatch_execbuffer = hsw_ring_dispatch_execbuffer;
2879

2880 2881
	engine->init_hw = init_render_ring;
	engine->cleanup = render_ring_cleanup;
2882

2883
	ret = intel_init_ring_buffer(dev, engine);
2884 2885 2886
	if (ret)
		return ret;

2887
	if (INTEL_GEN(dev_priv) >= 6) {
2888 2889 2890 2891 2892
		ret = intel_init_pipe_control(engine, 4096);
		if (ret)
			return ret;
	} else if (HAS_BROKEN_CS_TLB(dev_priv)) {
		ret = intel_init_pipe_control(engine, I830_WA_SIZE);
2893 2894 2895 2896 2897
		if (ret)
			return ret;
	}

	return 0;
2898 2899 2900 2901
}

int intel_init_bsd_ring_buffer(struct drm_device *dev)
{
2902
	struct drm_i915_private *dev_priv = to_i915(dev);
2903
	struct intel_engine_cs *engine = &dev_priv->engine[VCS];
2904

2905 2906 2907
	engine->name = "bsd ring";
	engine->id = VCS;
	engine->exec_id = I915_EXEC_BSD;
2908
	engine->hw_id = 1;
2909

2910 2911
	intel_ring_default_vfuncs(dev_priv, engine);

2912
	if (INTEL_GEN(dev_priv) >= 6) {
2913
		engine->mmio_base = GEN6_BSD_RING_BASE;
2914
		/* gen6 bsd needs a special wa for tail updates */
2915
		if (IS_GEN6(dev_priv))
2916 2917
			engine->write_tail = gen6_bsd_ring_write_tail;
		engine->flush = gen6_bsd_ring_flush;
2918
		if (INTEL_GEN(dev_priv) >= 8)
2919
			engine->irq_enable_mask =
2920
				GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
2921
		else
2922
			engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
2923
	} else {
2924 2925
		engine->mmio_base = BSD_RING_BASE;
		engine->flush = bsd_ring_flush;
2926
		if (IS_GEN5(dev_priv))
2927
			engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
2928
		else
2929
			engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
2930 2931
	}

2932
	return intel_init_ring_buffer(dev, engine);
2933
}
2934

2935
/**
2936
 * Initialize the second BSD ring (e.g. Broadwell GT3, Skylake GT3)
2937 2938 2939
 */
int intel_init_bsd2_ring_buffer(struct drm_device *dev)
{
2940
	struct drm_i915_private *dev_priv = to_i915(dev);
2941
	struct intel_engine_cs *engine = &dev_priv->engine[VCS2];
2942 2943 2944 2945

	engine->name = "bsd2 ring";
	engine->id = VCS2;
	engine->exec_id = I915_EXEC_BSD;
2946
	engine->hw_id = 4;
2947
	engine->mmio_base = GEN8_BSD2_RING_BASE;
2948 2949 2950

	intel_ring_default_vfuncs(dev_priv, engine);

2951 2952
	engine->flush = gen6_bsd_ring_flush;
	engine->irq_enable_mask =
2953 2954
			GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;

2955
	return intel_init_ring_buffer(dev, engine);
2956 2957
}

2958 2959
int intel_init_blt_ring_buffer(struct drm_device *dev)
{
2960
	struct drm_i915_private *dev_priv = to_i915(dev);
2961
	struct intel_engine_cs *engine = &dev_priv->engine[BCS];
2962 2963 2964 2965

	engine->name = "blitter ring";
	engine->id = BCS;
	engine->exec_id = I915_EXEC_BLT;
2966
	engine->hw_id = 2;
2967
	engine->mmio_base = BLT_RING_BASE;
2968 2969 2970

	intel_ring_default_vfuncs(dev_priv, engine);

2971
	engine->flush = gen6_ring_flush;
2972
	if (INTEL_GEN(dev_priv) >= 8)
2973
		engine->irq_enable_mask =
2974
			GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
2975
	else
2976
		engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
2977

2978
	return intel_init_ring_buffer(dev, engine);
2979
}
2980

int intel_init_vebox_ring_buffer(struct drm_device *dev)
{
2983
	struct drm_i915_private *dev_priv = to_i915(dev);
2984
	struct intel_engine_cs *engine = &dev_priv->engine[VECS];

2986 2987 2988
	engine->name = "video enhancement ring";
	engine->id = VECS;
	engine->exec_id = I915_EXEC_VEBOX;
2989
	engine->hw_id = 3;
2990
	engine->mmio_base = VEBOX_RING_BASE;
2991 2992 2993

	intel_ring_default_vfuncs(dev_priv, engine);

2994
	engine->flush = gen6_ring_flush;
2995

2996
	if (INTEL_GEN(dev_priv) >= 8) {
2997
		engine->irq_enable_mask =
2998
			GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
2999
	} else {
3000
		engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
3001 3002
		engine->irq_enable = hsw_vebox_irq_enable;
		engine->irq_disable = hsw_vebox_irq_disable;
3003
	}

3005
	return intel_init_ring_buffer(dev, engine);
}

3008
int
3009
intel_ring_flush_all_caches(struct drm_i915_gem_request *req)
3010
{
3011
	struct intel_engine_cs *engine = req->engine;
3012 3013
	int ret;

3014
	if (!engine->gpu_caches_dirty)
3015 3016
		return 0;

3017
	ret = engine->flush(req, 0, I915_GEM_GPU_DOMAINS);
3018 3019 3020
	if (ret)
		return ret;

3021
	trace_i915_gem_ring_flush(req, 0, I915_GEM_GPU_DOMAINS);
3022

3023
	engine->gpu_caches_dirty = false;
3024 3025 3026 3027
	return 0;
}

int
3028
intel_ring_invalidate_all_caches(struct drm_i915_gem_request *req)
3029
{
3030
	struct intel_engine_cs *engine = req->engine;
3031 3032 3033 3034
	uint32_t flush_domains;
	int ret;

	flush_domains = 0;
3035
	if (engine->gpu_caches_dirty)
3036 3037
		flush_domains = I915_GEM_GPU_DOMAINS;

3038
	ret = engine->flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
3039 3040 3041
	if (ret)
		return ret;

3042
	trace_i915_gem_ring_flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
3043

3044
	engine->gpu_caches_dirty = false;
3045 3046
	return 0;
}
3047 3048

void
3049
intel_stop_engine(struct intel_engine_cs *engine)
3050 3051 3052
{
	int ret;

3053
	if (!intel_engine_initialized(engine))
3054 3055
		return;

3056
	ret = intel_engine_idle(engine);
3057
	if (ret)
3058
		DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
3059
			  engine->name, ret);
3060

3061
	stop_ring(engine);
3062
}