/*
 * Copyright © 2008-2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Zou Nan hai <nanhai.zou@intel.com>
 *    Xiang Hai hao<haihao.xiang@intel.com>
 *
 */

#include <linux/log2.h>
#include <drm/drmP.h>
#include "i915_drv.h"
#include <drm/i915_drm.h>
#include "i915_trace.h"
#include "intel_drv.h"

/* Rough estimate of the typical request size, performing a flush,
 * set-context and then emitting the batch.
 */
#define LEGACY_REQUEST_SIZE 200

int __intel_ring_space(int head, int tail, int size)
{
	int space = head - tail;
	if (space <= 0)
		space += size;
	return space - I915_RING_FREE_SPACE;
}
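/*
 * Worked example (illustrative numbers, not from the hardware): for a
 * 4096-byte ring with head == 512 and tail == 3584, space is
 * 512 - 3584 = -3072, which wraps to -3072 + 4096 = 1024 bytes between
 * tail and head; callers may then fill up to 1024 - I915_RING_FREE_SPACE
 * bytes before having to wait for the head to advance.
 */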

void intel_ring_update_space(struct intel_ringbuffer *ringbuf)
{
	if (ringbuf->last_retired_head != -1) {
		ringbuf->head = ringbuf->last_retired_head;
		ringbuf->last_retired_head = -1;
	}

	ringbuf->space = __intel_ring_space(ringbuf->head & HEAD_ADDR,
					    ringbuf->tail, ringbuf->size);
}

bool intel_engine_stopped(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	return dev_priv->gpu_error.stop_rings & intel_engine_flag(engine);
}

static void __intel_ring_advance(struct intel_engine_cs *engine)
{
	struct intel_ringbuffer *ringbuf = engine->buffer;
	ringbuf->tail &= ringbuf->size - 1;
	if (intel_engine_stopped(engine))
		return;
	engine->write_tail(engine, ringbuf->tail);
}

static int
gen2_render_ring_flush(struct drm_i915_gem_request *req,
		       u32	invalidate_domains,
		       u32	flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 cmd;
	int ret;

	cmd = MI_FLUSH;
	if (((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER) == 0)
		cmd |= MI_NO_WRITE_FLUSH;

	if (invalidate_domains & I915_GEM_DOMAIN_SAMPLER)
		cmd |= MI_READ_FLUSH;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

static int
gen4_render_ring_flush(struct drm_i915_gem_request *req,
		       u32	invalidate_domains,
		       u32	flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 cmd;
	int ret;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */

	cmd = MI_FLUSH | MI_NO_WRITE_FLUSH;
	if ((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER)
		cmd &= ~MI_NO_WRITE_FLUSH;
	if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION)
		cmd |= MI_EXE_FLUSH;

	if (invalidate_domains & I915_GEM_DOMAIN_COMMAND &&
	    (IS_G4X(req->i915) || IS_GEN5(req->i915)))
		cmd |= MI_INVALIDATE_ISP;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(5));
	intel_ring_emit(engine, PIPE_CONTROL_CS_STALL |
			PIPE_CONTROL_STALL_AT_SCOREBOARD);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
	intel_ring_emit(engine, 0); /* low dword */
	intel_ring_emit(engine, 0); /* high dword */
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(5));
	intel_ring_emit(engine, PIPE_CONTROL_QW_WRITE);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

static int
gen6_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 flags = 0;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = intel_emit_post_sync_nonzero_flush(req);
	if (ret)
		return ret;

	/* Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, PIPE_CONTROL_CS_STALL |
			      PIPE_CONTROL_STALL_AT_SCOREBOARD);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen7_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 flags = 0;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/* Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;

		/* Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set. */
		gen7_render_ring_cs_stall_wa(req);
	}

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen8_emit_pipe_control(struct drm_i915_gem_request *req,
		       u32 flags, u32 scratch_addr)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(6));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen8_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	u32 flags = 0;
	u32 scratch_addr = req->engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	flags |= PIPE_CONTROL_CS_STALL;

	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		/* WaCsStallBeforeStateCacheInvalidate:bdw,chv */
		ret = gen8_emit_pipe_control(req,
					     PIPE_CONTROL_CS_STALL |
					     PIPE_CONTROL_STALL_AT_SCOREBOARD,
					     0);
		if (ret)
			return ret;
	}

	return gen8_emit_pipe_control(req, flags, scratch_addr);
}

static void ring_write_tail(struct intel_engine_cs *engine,
			    u32 value)
{
	struct drm_i915_private *dev_priv = engine->i915;
	I915_WRITE_TAIL(engine, value);
}

u64 intel_ring_get_active_head(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	u64 acthd;

	if (INTEL_GEN(dev_priv) >= 8)
		acthd = I915_READ64_2x32(RING_ACTHD(engine->mmio_base),
					 RING_ACTHD_UDW(engine->mmio_base));
	else if (INTEL_GEN(dev_priv) >= 4)
		acthd = I915_READ(RING_ACTHD(engine->mmio_base));
	else
		acthd = I915_READ(ACTHD);

	return acthd;
}

static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	u32 addr;

	addr = dev_priv->status_page_dmah->busaddr;
	if (INTEL_GEN(dev_priv) >= 4)
		addr |= (dev_priv->status_page_dmah->busaddr >> 28) & 0xf0;
	I915_WRITE(HWS_PGA, addr);
}

static void intel_ring_setup_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	i915_reg_t mmio;

	/* The ring status page addresses are no longer next to the rest of
	 * the ring registers as of gen7.
	 */
	if (IS_GEN7(dev_priv)) {
		switch (engine->id) {
		case RCS:
			mmio = RENDER_HWS_PGA_GEN7;
			break;
		case BCS:
			mmio = BLT_HWS_PGA_GEN7;
			break;
		/*
		 * VCS2 actually doesn't exist on Gen7. It is only listed
		 * here to silence the gcc switch check warning.
		 */
		case VCS2:
		case VCS:
			mmio = BSD_HWS_PGA_GEN7;
			break;
		case VECS:
			mmio = VEBOX_HWS_PGA_GEN7;
			break;
		}
	} else if (IS_GEN6(dev_priv)) {
		mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
	} else {
		/* XXX: gen8 returns to sanity */
		mmio = RING_HWS_PGA(engine->mmio_base);
	}

	I915_WRITE(mmio, (u32)engine->status_page.gfx_addr);
	POSTING_READ(mmio);

	/*
	 * Flush the TLB for this page
	 *
	 * FIXME: These two bits have disappeared on gen8, so a question
	 * arises: do we still need this and if so how should we go about
	 * invalidating the TLB?
	 */
	if (IS_GEN(dev_priv, 6, 7)) {
		i915_reg_t reg = RING_INSTPM(engine->mmio_base);

		/* ring should be idle before issuing a sync flush */
		WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);

		I915_WRITE(reg,
			   _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
					      INSTPM_SYNC_FLUSH));
		if (intel_wait_for_register(dev_priv,
					    reg, INSTPM_SYNC_FLUSH, 0,
					    1000))
			DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
				  engine->name);
	}
}

static bool stop_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (!IS_GEN2(dev_priv)) {
		I915_WRITE_MODE(engine, _MASKED_BIT_ENABLE(STOP_RING));
		if (intel_wait_for_register(dev_priv,
					    RING_MI_MODE(engine->mmio_base),
					    MODE_IDLE,
					    MODE_IDLE,
					    1000)) {
			DRM_ERROR("%s : timed out trying to stop ring\n",
				  engine->name);
			/* Sometimes we observe that the idle flag is not
			 * set even though the ring is empty. So double
			 * check before giving up.
			 */
			if (I915_READ_HEAD(engine) != I915_READ_TAIL(engine))
				return false;
		}
	}

	I915_WRITE_CTL(engine, 0);
	I915_WRITE_HEAD(engine, 0);
	engine->write_tail(engine, 0);

	if (!IS_GEN2(dev_priv)) {
		(void)I915_READ_CTL(engine);
		I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
	}

	return (I915_READ_HEAD(engine) & HEAD_ADDR) == 0;
}

void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
{
	memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
}

static int init_ring_common(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_ringbuffer *ringbuf = engine->buffer;
	struct drm_i915_gem_object *obj = ringbuf->obj;
	int ret = 0;

	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);

	if (!stop_ring(engine)) {
		/* G45 ring initialization often fails to reset head to zero */
		DRM_DEBUG_KMS("%s head not reset to zero "
			      "ctl %08x head %08x tail %08x start %08x\n",
			      engine->name,
			      I915_READ_CTL(engine),
			      I915_READ_HEAD(engine),
			      I915_READ_TAIL(engine),
			      I915_READ_START(engine));

		if (!stop_ring(engine)) {
			DRM_ERROR("failed to set %s head to zero "
				  "ctl %08x head %08x tail %08x start %08x\n",
				  engine->name,
				  I915_READ_CTL(engine),
				  I915_READ_HEAD(engine),
				  I915_READ_TAIL(engine),
				  I915_READ_START(engine));
			ret = -EIO;
			goto out;
		}
	}

	if (I915_NEED_GFX_HWS(dev_priv))
		intel_ring_setup_status_page(engine);
	else
		ring_setup_phys_status_page(engine);

	/* Enforce ordering by reading HEAD register back */
	I915_READ_HEAD(engine);

	/* Initialize the ring. This must happen _after_ we've cleared the ring
	 * registers with the above sequence (the readback of the HEAD registers
	 * also enforces ordering), otherwise the hw might lose the new ring
	 * register values. */
	I915_WRITE_START(engine, i915_gem_obj_ggtt_offset(obj));

	/* WaClearRingBufHeadRegAtInit:ctg,elk */
	if (I915_READ_HEAD(engine))
		DRM_DEBUG("%s initialization failed [head=%08x], fudging\n",
			  engine->name, I915_READ_HEAD(engine));
	I915_WRITE_HEAD(engine, 0);
	(void)I915_READ_HEAD(engine);

	I915_WRITE_CTL(engine,
			((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES)
			| RING_VALID);

	/* If the head is still not zero, the ring is dead */
	if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 &&
		     I915_READ_START(engine) == i915_gem_obj_ggtt_offset(obj) &&
		     (I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) {
		DRM_ERROR("%s initialization failed "
			  "ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08lx]\n",
			  engine->name,
			  I915_READ_CTL(engine),
			  I915_READ_CTL(engine) & RING_VALID,
			  I915_READ_HEAD(engine), I915_READ_TAIL(engine),
			  I915_READ_START(engine),
			  (unsigned long)i915_gem_obj_ggtt_offset(obj));
		ret = -EIO;
		goto out;
	}

	ringbuf->last_retired_head = -1;
	ringbuf->head = I915_READ_HEAD(engine);
	ringbuf->tail = I915_READ_TAIL(engine) & TAIL_ADDR;
	intel_ring_update_space(ringbuf);

	intel_engine_init_hangcheck(engine);

out:
	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);

	return ret;
}

void intel_fini_pipe_control(struct intel_engine_cs *engine)
{
	if (engine->scratch.obj == NULL)
		return;

	i915_gem_object_ggtt_unpin(engine->scratch.obj);
	drm_gem_object_unreference(&engine->scratch.obj->base);
	engine->scratch.obj = NULL;
}

int intel_init_pipe_control(struct intel_engine_cs *engine, int size)
{
	struct drm_i915_gem_object *obj;
	int ret;

	WARN_ON(engine->scratch.obj);

	obj = i915_gem_object_create_stolen(engine->i915->dev, size);
	if (!obj)
		obj = i915_gem_object_create(engine->i915->dev, size);
	if (IS_ERR(obj)) {
		DRM_ERROR("Failed to allocate scratch page\n");
		ret = PTR_ERR(obj);
		goto err;
	}

	ret = i915_gem_obj_ggtt_pin(obj, 4096, PIN_HIGH);
	if (ret)
		goto err_unref;

	engine->scratch.obj = obj;
	engine->scratch.gtt_offset = i915_gem_obj_ggtt_offset(obj);
	DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08x\n",
			 engine->name, engine->scratch.gtt_offset);
	return 0;

err_unref:
	drm_gem_object_unreference(&engine->scratch.obj->base);
err:
	return ret;
}

static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	struct i915_workarounds *w = &req->i915->workarounds;
	int ret, i;

	if (w->count == 0)
		return 0;

	engine->gpu_caches_dirty = true;
	ret = intel_ring_flush_all_caches(req);
	if (ret)
		return ret;

	ret = intel_ring_begin(req, (w->count * 2 + 2));
	if (ret)
		return ret;

	intel_ring_emit(engine, MI_LOAD_REGISTER_IMM(w->count));
	for (i = 0; i < w->count; i++) {
		intel_ring_emit_reg(engine, w->reg[i].addr);
		intel_ring_emit(engine, w->reg[i].value);
	}
	intel_ring_emit(engine, MI_NOOP);

	intel_ring_advance(engine);

	engine->gpu_caches_dirty = true;
	ret = intel_ring_flush_all_caches(req);
	if (ret)
		return ret;

	DRM_DEBUG_DRIVER("Number of Workarounds emitted: %d\n", w->count);

	return 0;
}

static int intel_rcs_ctx_init(struct drm_i915_gem_request *req)
{
	int ret;

	ret = intel_ring_workarounds_emit(req);
	if (ret != 0)
		return ret;

	ret = i915_gem_render_state_init(req);
	if (ret)
		return ret;

	return 0;
}

static int wa_add(struct drm_i915_private *dev_priv,
		  i915_reg_t addr,
		  const u32 mask, const u32 val)
{
	const u32 idx = dev_priv->workarounds.count;

	if (WARN_ON(idx >= I915_MAX_WA_REGS))
		return -ENOSPC;

	dev_priv->workarounds.reg[idx].addr = addr;
	dev_priv->workarounds.reg[idx].value = val;
	dev_priv->workarounds.reg[idx].mask = mask;

	dev_priv->workarounds.count++;

	return 0;
}

#define WA_REG(addr, mask, val) do { \
		const int r = wa_add(dev_priv, (addr), (mask), (val)); \
		if (r) \
			return r; \
	} while (0)

#define WA_SET_BIT_MASKED(addr, mask) \
	WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask))

#define WA_CLR_BIT_MASKED(addr, mask) \
	WA_REG(addr, (mask), _MASKED_BIT_DISABLE(mask))

#define WA_SET_FIELD_MASKED(addr, mask, value) \
	WA_REG(addr, mask, _MASKED_FIELD(mask, value))

#define WA_SET_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) | (mask))
#define WA_CLR_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) & ~(mask))

#define WA_WRITE(addr, val) WA_REG(addr, 0xffffffff, val)
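/*
 * Illustrative expansion (not an additional workaround): a call such as
 * WA_SET_BIT_MASKED(HDC_CHICKEN0, HDC_FORCE_NON_COHERENT) boils down to
 * wa_add(dev_priv, HDC_CHICKEN0, HDC_FORCE_NON_COHERENT,
 *        _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT)), i.e. it only records
 * the register/value pair in dev_priv->workarounds;
 * intel_ring_workarounds_emit() above later replays the recorded list via
 * MI_LOAD_REGISTER_IMM when a render context is initialised.
 */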

static int wa_ring_whitelist_reg(struct intel_engine_cs *engine,
				 i915_reg_t reg)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct i915_workarounds *wa = &dev_priv->workarounds;
	const uint32_t index = wa->hw_whitelist_count[engine->id];

	if (WARN_ON(index >= RING_MAX_NONPRIV_SLOTS))
		return -EINVAL;

	WA_WRITE(RING_FORCE_TO_NONPRIV(engine->mmio_base, index),
		 i915_mmio_reg_offset(reg));
	wa->hw_whitelist_count[engine->id]++;

	return 0;
}

static int gen8_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);

	/* WaDisableAsyncFlipPerfMode:bdw,chv */
	WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);

	/* WaDisablePartialInstShootdown:bdw,chv */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);

	/* Use Force Non-Coherent whenever executing a 3D context. This is a
	 * workaround for a possible hang in the unlikely event a TLB
	 * invalidation occurs during a PSD flush.
	 */
	/* WaForceEnableNonCoherent:bdw,chv */
	/* WaHdcDisableFetchWhenMasked:bdw,chv */
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  HDC_DONOT_FETCH_MEM_WHEN_MASKED |
			  HDC_FORCE_NON_COHERENT);

	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
	 *  polygons in the same 8x4 pixel/sample area to be processed without
	 *  stalling waiting for the earlier ones to write to Hierarchical Z
	 *  buffer."
	 *
	 * This optimization is off by default for BDW and CHV; turn it on.
	 */
	WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);

	/* Wa4x4STCOptimizationDisable:bdw,chv */
	WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);

	/*
	 * BSpec recommends 8x4 when MSAA is used,
	 * however in practice 16x4 seems fastest.
	 *
	 * Note that PS/WM thread counts depend on the WIZ hashing
	 * disable bit, which we don't touch here, but it's good
	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
	 */
	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
			    GEN6_WIZ_HASHING_MASK,
			    GEN6_WIZ_HASHING_16x4);

	return 0;
}

static int bdw_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen8_init_workarounds(engine);
	if (ret)
		return ret;

	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);

	/* WaDisableDopClockGating:bdw */
	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
			  DOP_CLOCK_GATING_DISABLE);

	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
			  GEN8_SAMPLER_POWER_BYPASS_DIS);

	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  /* WaForceContextSaveRestoreNonCoherent:bdw */
			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
			  /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
			  (IS_BDW_GT3(dev_priv) ? HDC_FENCE_DEST_SLM_DISABLE : 0));

	return 0;
}

static int chv_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen8_init_workarounds(engine);
	if (ret)
		return ret;

	/* WaDisableThreadStallDopClockGating:chv */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);

	/* Improve HiZ throughput on CHV. */
	WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);

	return 0;
}

static int gen9_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	/* WaConextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl */
	I915_WRITE(GEN9_CSFE_CHICKEN1_RCS, _MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));

	/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl */
	I915_WRITE(BDW_SCRATCH1, I915_READ(BDW_SCRATCH1) |
		   GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);

	/* WaDisableKillLogic:bxt,skl,kbl */
	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
		   ECOCHK_DIS_TLB);

	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl */
	/* WaDisablePartialInstShootdown:skl,bxt,kbl */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
			  FLOW_CONTROL_ENABLE |
			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);

	/* Syncing dependencies between camera and graphics:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
			  GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC);

	/* WaDisableDgMirrorFixInHalfSliceChicken5:skl,bxt */
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
		WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
				  GEN9_DG_MIRROR_FIX_ENABLE);

	/* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
		WA_SET_BIT_MASKED(GEN7_COMMON_SLICE_CHICKEN1,
				  GEN9_RHWO_OPTIMIZATION_DISABLE);
		/*
		 * WA also requires GEN9_SLICE_COMMON_ECO_CHICKEN0[14:14] to be set
		 * but we do that in per ctx batchbuffer as there is an issue
		 * with this register not getting restored on ctx restore
		 */
	}

	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl */
	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl */
	WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
			  GEN9_ENABLE_YV12_BUGFIX |
			  GEN9_ENABLE_GPGPU_PREEMPTION);

	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl */
	/* WaDisablePartialResolveInVc:skl,bxt,kbl */
	WA_SET_BIT_MASKED(CACHE_MODE_1, (GEN8_4x4_STC_OPTIMIZATION_DISABLE |
					 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE));

	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl */
	WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
			  GEN9_CCS_TLB_PREFETCH_ENABLE);

	/* WaDisableMaskBasedCammingInRCC:skl,bxt */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, SKL_REVID_C0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
		WA_SET_BIT_MASKED(SLICE_ECO_CHICKEN0,
				  PIXEL_MASK_CAMMING_DISABLE);

	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);

	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
	 * both tied to WaForceContextSaveRestoreNonCoherent
	 * in some hsds for skl. We keep the tie for all gen9. The
	 * documentation is a bit hazy and so we want to get common behaviour,
	 * even though there is no clear evidence we would need both on kbl/bxt.
	 * This area has been a source of system hangs so we play it safe
	 * and mimic the skl regardless of what bspec says.
	 *
	 * Use Force Non-Coherent whenever executing a 3D context. This
	 * is a workaround for a possible hang in the unlikely event
	 * a TLB invalidation occurs during a PSD flush.
	 */

	/* WaForceEnableNonCoherent:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  HDC_FORCE_NON_COHERENT);

	/* WaDisableHDCInvalidation:skl,bxt,kbl */
	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
		   BDW_DISABLE_HDC_INVALIDATION);

	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl */
	if (IS_SKYLAKE(dev_priv) ||
	    IS_KABYLAKE(dev_priv) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
				  GEN8_SAMPLER_POWER_BYPASS_DIS);

	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);

	/* WaOCLCoherentLineFlush:skl,bxt,kbl */
	I915_WRITE(GEN8_L3SQCREG4, (I915_READ(GEN8_L3SQCREG4) |
				    GEN8_LQSC_FLUSH_COHERENT_LINES));

	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt */
	ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
	if (ret)
		return ret;

	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl */
	ret = wa_ring_whitelist_reg(engine, GEN8_CS_CHICKEN1);
	if (ret)
		return ret;

	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl */
	ret = wa_ring_whitelist_reg(engine, GEN8_HDC_CHICKEN1);
	if (ret)
		return ret;

	return 0;
}

static int skl_tune_iz_hashing(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	u8 vals[3] = { 0, 0, 0 };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		u8 ss;

		/*
		 * Only consider slices where one, and only one, subslice has 7
		 * EUs
		 */
		if (!is_power_of_2(dev_priv->info.subslice_7eu[i]))
			continue;

		/*
		 * subslice_7eu[i] != 0 (because of the check above) and
		 * ss_max == 4 (maximum number of subslices possible per slice)
		 *
		 * ->    0 <= ss <= 3;
		 */
		ss = ffs(dev_priv->info.subslice_7eu[i]) - 1;
		vals[i] = 3 - ss;
	}

	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
		return 0;

	/* Tune IZ hashing. See intel_device_info_runtime_init() */
	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
			    GEN9_IZ_HASHING_MASK(2) |
			    GEN9_IZ_HASHING_MASK(1) |
			    GEN9_IZ_HASHING_MASK(0),
			    GEN9_IZ_HASHING(2, vals[2]) |
			    GEN9_IZ_HASHING(1, vals[1]) |
			    GEN9_IZ_HASHING(0, vals[0]));

	return 0;
}

static int skl_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen9_init_workarounds(engine);
	if (ret)
		return ret;

	/*
	 * Actual WA is to disable percontext preemption granularity control
	 * until D0 which is the default case so this is equivalent to
	 * !WaDisablePerCtxtPreemptionGranularityControl:skl
	 */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_E0, REVID_FOREVER)) {
		I915_WRITE(GEN7_FF_SLICE_CS_CHICKEN1,
			   _MASKED_BIT_ENABLE(GEN9_FFSC_PERCTX_PREEMPT_CTRL));
	}

	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_E0)) {
		/* WaDisableChickenBitTSGBarrierAckForFFSliceCS:skl */
		I915_WRITE(FF_SLICE_CS_CHICKEN2,
			   _MASKED_BIT_ENABLE(GEN9_TSG_BARRIER_ACK_DISABLE));
	}

	/* GEN8_L3SQCREG4 has a dependency with WA batch so any new changes
	 * involving this register should also be added to WA batch as required.
	 */
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_E0))
		/* WaDisableLSQCROPERFforOCL:skl */
		I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
			   GEN8_LQSC_RO_PERF_DIS);

	/* WaEnableGapsTsvCreditFix:skl */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, REVID_FOREVER)) {
		I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
					   GEN9_GAPS_TSV_CREDIT_DISABLE));
	}

	/* WaDisablePowerCompilerClockGating:skl */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_B0, SKL_REVID_B0))
		WA_SET_BIT_MASKED(HIZ_CHICKEN,
				  BDW_HIZ_POWER_COMPILER_CLOCK_GATING_DISABLE);

	/* WaBarrierPerformanceFixDisable:skl */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, SKL_REVID_D0))
		WA_SET_BIT_MASKED(HDC_CHICKEN0,
				  HDC_FENCE_DEST_SLM_DISABLE |
				  HDC_BARRIER_PERFORMANCE_DISABLE);

	/* WaDisableSbeCacheDispatchPortSharing:skl */
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_F0))
		WA_SET_BIT_MASKED(
			GEN7_HALF_SLICE_CHICKEN1,
			GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);

	/* WaDisableGafsUnitClkGating:skl */
	WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaDisableLSQCROPERFforOCL:skl */
	ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
	if (ret)
		return ret;

	return skl_tune_iz_hashing(engine);
}

static int bxt_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen9_init_workarounds(engine);
	if (ret)
		return ret;

	/* WaStoreMultiplePTEenable:bxt */
	/* This is a requirement according to Hardware specification */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
		I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_TLBPF);

	/* WaSetClckGatingDisableMedia:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
		I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
					    ~GEN8_DOP_CLOCK_GATE_MEDIA_ENABLE));
	}

	/* WaDisableThreadStallDopClockGating:bxt */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
			  STALL_DOP_GATING_DISABLE);

	/* WaDisablePooledEuLoadBalancingFix:bxt */
	if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER)) {
		WA_SET_BIT_MASKED(FF_SLICE_CS_CHICKEN2,
				  GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
	}

	/* WaDisableSbeCacheDispatchPortSharing:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0)) {
		WA_SET_BIT_MASKED(
			GEN7_HALF_SLICE_CHICKEN1,
			GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
	}

	/* WaDisableObjectLevelPreemptionForTrifanOrPolygon:bxt */
	/* WaDisableObjectLevelPreemptionForInstancedDraw:bxt */
	/* WaDisableObjectLevelPreemtionForInstanceId:bxt */
	/* WaDisableLSQCROPERFforOCL:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
		ret = wa_ring_whitelist_reg(engine, GEN9_CS_DEBUG_MODE1);
		if (ret)
			return ret;

		ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
		if (ret)
			return ret;
	}

	/* WaProgramL3SqcReg1DefaultForPerf:bxt */
	if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER))
		I915_WRITE(GEN8_L3SQCREG1, L3_GENERAL_PRIO_CREDITS(62) |
					   L3_HIGH_PRIO_CREDITS(2));

	/* WaInsertDummyPushConstPs:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

	return 0;
}

static int kbl_init_workarounds(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen9_init_workarounds(engine);
	if (ret)
		return ret;

	/* WaEnableGapsTsvCreditFix:kbl */
	I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
				   GEN9_GAPS_TSV_CREDIT_DISABLE));

	/* WaDisableDynamicCreditSharing:kbl */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
		WA_SET_BIT(GAMT_CHKN_BIT_REG,
			   GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);

	/* WaDisableFenceDestinationToSLM:kbl (pre-prod) */
	if (IS_KBL_REVID(dev_priv, KBL_REVID_A0, KBL_REVID_A0))
		WA_SET_BIT_MASKED(HDC_CHICKEN0,
				  HDC_FENCE_DEST_SLM_DISABLE);

	/* GEN8_L3SQCREG4 has a dependency with WA batch so any new changes
	 * involving this register should also be added to WA batch as required.
	 */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_E0))
		/* WaDisableLSQCROPERFforOCL:kbl */
		I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
			   GEN8_LQSC_RO_PERF_DIS);

	/* WaInsertDummyPushConstPs:kbl */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

	/* WaDisableGafsUnitClkGating:kbl */
	WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaDisableSbeCacheDispatchPortSharing:kbl */
	WA_SET_BIT_MASKED(
		GEN7_HALF_SLICE_CHICKEN1,
		GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);

	/* WaDisableLSQCROPERFforOCL:kbl */
	ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
	if (ret)
		return ret;

	return 0;
}

int init_workarounds_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	WARN_ON(engine->id != RCS);

	dev_priv->workarounds.count = 0;
	dev_priv->workarounds.hw_whitelist_count[RCS] = 0;

	if (IS_BROADWELL(dev_priv))
		return bdw_init_workarounds(engine);

	if (IS_CHERRYVIEW(dev_priv))
		return chv_init_workarounds(engine);

	if (IS_SKYLAKE(dev_priv))
		return skl_init_workarounds(engine);

	if (IS_BROXTON(dev_priv))
		return bxt_init_workarounds(engine);

	if (IS_KABYLAKE(dev_priv))
		return kbl_init_workarounds(engine);

	return 0;
}

static int init_render_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret = init_ring_common(engine);
	if (ret)
		return ret;

	/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
	if (IS_GEN(dev_priv, 4, 6))
		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));

	/* We need to disable the AsyncFlip performance optimisations in order
	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
	 * programmed to '1' on all products.
	 *
	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
	 */
	if (IS_GEN(dev_priv, 6, 7))
		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));

	/* Required for the hardware to program scanline values for waiting */
	/* WaEnableFlushTlbInvalidationMode:snb */
	if (IS_GEN6(dev_priv))
		I915_WRITE(GFX_MODE,
			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));

	/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
	if (IS_GEN7(dev_priv))
		I915_WRITE(GFX_MODE_GEN7,
			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
			   _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));

	if (IS_GEN6(dev_priv)) {
		/* From the Sandybridge PRM, volume 1 part 3, page 24:
		 * "If this bit is set, STCunit will have LRA as replacement
		 *  policy. [...] This bit must be reset.  LRA replacement
		 *  policy is not supported."
		 */
		I915_WRITE(CACHE_MODE_0,
			   _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
	}

	if (IS_GEN(dev_priv, 6, 7))
		I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));

	if (HAS_L3_DPF(dev_priv))
		I915_WRITE_IMR(engine, ~GT_PARITY_ERROR(dev_priv));

	return init_workarounds_ring(engine);
}

static void render_ring_cleanup(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (dev_priv->semaphore_obj) {
		i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj);
		drm_gem_object_unreference(&dev_priv->semaphore_obj->base);
		dev_priv->semaphore_obj = NULL;
	}

	intel_fini_pipe_control(engine);
}

static int gen8_rcs_signal(struct drm_i915_gem_request *signaller_req,
			   unsigned int num_dwords)
{
#define MBOX_UPDATE_DWORDS 8
	struct intel_engine_cs *signaller = signaller_req->engine;
	struct drm_i915_private *dev_priv = signaller_req->i915;
	struct intel_engine_cs *waiter;
	enum intel_engine_id id;
	int ret, num_rings;

	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
	num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
#undef MBOX_UPDATE_DWORDS

	ret = intel_ring_begin(signaller_req, num_dwords);
	if (ret)
		return ret;

	for_each_engine_id(waiter, dev_priv, id) {
		u64 gtt_offset = signaller->semaphore.signal_ggtt[id];
		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
			continue;

		intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
		intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
					   PIPE_CONTROL_QW_WRITE |
					   PIPE_CONTROL_CS_STALL);
		intel_ring_emit(signaller, lower_32_bits(gtt_offset));
		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
		intel_ring_emit(signaller, signaller_req->seqno);
		intel_ring_emit(signaller, 0);
		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
					   MI_SEMAPHORE_TARGET(waiter->hw_id));
		intel_ring_emit(signaller, 0);
	}

	return 0;
}

static int gen8_xcs_signal(struct drm_i915_gem_request *signaller_req,
			   unsigned int num_dwords)
{
#define MBOX_UPDATE_DWORDS 6
	struct intel_engine_cs *signaller = signaller_req->engine;
	struct drm_i915_private *dev_priv = signaller_req->i915;
	struct intel_engine_cs *waiter;
	enum intel_engine_id id;
	int ret, num_rings;

	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
	num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
#undef MBOX_UPDATE_DWORDS

	ret = intel_ring_begin(signaller_req, num_dwords);
	if (ret)
		return ret;

	for_each_engine_id(waiter, dev_priv, id) {
		u64 gtt_offset = signaller->semaphore.signal_ggtt[id];
		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
			continue;

		intel_ring_emit(signaller, (MI_FLUSH_DW + 1) |
					   MI_FLUSH_DW_OP_STOREDW);
		intel_ring_emit(signaller, lower_32_bits(gtt_offset) |
					   MI_FLUSH_DW_USE_GTT);
		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
		intel_ring_emit(signaller, signaller_req->seqno);
		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
					   MI_SEMAPHORE_TARGET(waiter->hw_id));
		intel_ring_emit(signaller, 0);
	}

	return 0;
}

static int gen6_signal(struct drm_i915_gem_request *signaller_req,
		       unsigned int num_dwords)
{
	struct intel_engine_cs *signaller = signaller_req->engine;
	struct drm_i915_private *dev_priv = signaller_req->i915;
	struct intel_engine_cs *useless;
	enum intel_engine_id id;
	int ret, num_rings;

#define MBOX_UPDATE_DWORDS 3
	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
	num_dwords += round_up((num_rings-1) * MBOX_UPDATE_DWORDS, 2);
#undef MBOX_UPDATE_DWORDS

	ret = intel_ring_begin(signaller_req, num_dwords);
	if (ret)
		return ret;

	for_each_engine_id(useless, dev_priv, id) {
		i915_reg_t mbox_reg = signaller->semaphore.mbox.signal[id];

		if (i915_mmio_reg_valid(mbox_reg)) {
			intel_ring_emit(signaller, MI_LOAD_REGISTER_IMM(1));
			intel_ring_emit_reg(signaller, mbox_reg);
			intel_ring_emit(signaller, signaller_req->seqno);
		}
	}

	/* If num_dwords was rounded, make sure the tail pointer is correct */
	if (num_rings % 2 == 0)
		intel_ring_emit(signaller, MI_NOOP);

	return 0;
}
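/*
 * Illustrative arithmetic for the rounding above (example numbers): with 4
 * rings, the 3 other mailboxes need 3 * 3 = 9 dwords, which
 * round_up(..., 2) grows to 10; the trailing MI_NOOP emitted when num_rings
 * is even consumes that padding dword so the reserved space is fully
 * written and the tail stays qword-aligned.
 */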

/**
 * gen6_add_request - Update the semaphore mailbox registers
 *
 * @req - request to write to the ring
 *
 * Update the mailbox registers in the *other* rings with the current seqno.
 * This acts like a signal in the canonical semaphore.
 */
static int
gen6_add_request(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	if (engine->semaphore.signal)
		ret = engine->semaphore.signal(req, 4);
	else
		ret = intel_ring_begin(req, 4);

	if (ret)
		return ret;

	intel_ring_emit(engine, MI_STORE_DWORD_INDEX);
	intel_ring_emit(engine,
			I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
	intel_ring_emit(engine, req->seqno);
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	__intel_ring_advance(engine);

	return 0;
}

static int
gen8_render_add_request(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	if (engine->semaphore.signal)
		ret = engine->semaphore.signal(req, 8);
	else
		ret = intel_ring_begin(req, 8);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(6));
	intel_ring_emit(engine, (PIPE_CONTROL_GLOBAL_GTT_IVB |
				 PIPE_CONTROL_CS_STALL |
				 PIPE_CONTROL_QW_WRITE));
	intel_ring_emit(engine, intel_hws_seqno_address(req->engine));
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, i915_gem_request_get_seqno(req));
	/* We're thrashing one dword of HWS. */
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	intel_ring_emit(engine, MI_NOOP);
	__intel_ring_advance(engine);

	return 0;
}

static inline bool i915_gem_has_seqno_wrapped(struct drm_i915_private *dev_priv,
					      u32 seqno)
{
	return dev_priv->last_seqno < seqno;
}

/**
 * intel_ring_sync - sync the waiter to the signaller on seqno
 *
 * @waiter - ring that is waiting
 * @signaller - ring which has, or will signal
 * @seqno - seqno which the waiter will block on
 */

static int
gen8_ring_sync(struct drm_i915_gem_request *waiter_req,
	       struct intel_engine_cs *signaller,
	       u32 seqno)
{
	struct intel_engine_cs *waiter = waiter_req->engine;
	struct drm_i915_private *dev_priv = waiter_req->i915;
	u64 offset = GEN8_WAIT_OFFSET(waiter, signaller->id);
	struct i915_hw_ppgtt *ppgtt;
	int ret;

	ret = intel_ring_begin(waiter_req, 4);
	if (ret)
		return ret;

	intel_ring_emit(waiter, MI_SEMAPHORE_WAIT |
				MI_SEMAPHORE_GLOBAL_GTT |
				MI_SEMAPHORE_SAD_GTE_SDD);
	intel_ring_emit(waiter, seqno);
	intel_ring_emit(waiter, lower_32_bits(offset));
	intel_ring_emit(waiter, upper_32_bits(offset));
	intel_ring_advance(waiter);

	/* When the !RCS engines idle waiting upon a semaphore, they lose their
	 * pagetables and we must reload them before executing the batch.
	 * We do this on the i915_switch_context() following the wait and
	 * before the dispatch.
	 */
	ppgtt = waiter_req->ctx->ppgtt;
	if (ppgtt && waiter_req->engine->id != RCS)
		ppgtt->pd_dirty_rings |= intel_engine_flag(waiter_req->engine);
	return 0;
}

static int
gen6_ring_sync(struct drm_i915_gem_request *waiter_req,
	       struct intel_engine_cs *signaller,
	       u32 seqno)
{
	struct intel_engine_cs *waiter = waiter_req->engine;
	u32 dw1 = MI_SEMAPHORE_MBOX |
		  MI_SEMAPHORE_COMPARE |
		  MI_SEMAPHORE_REGISTER;
	u32 wait_mbox = signaller->semaphore.mbox.wait[waiter->id];
	int ret;

	/* Throughout all of the GEM code, seqno passed implies our current
	 * seqno is >= the last seqno executed. However for hardware the
	 * comparison is strictly greater than.
	 */
	seqno -= 1;

	WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);

	ret = intel_ring_begin(waiter_req, 4);
	if (ret)
		return ret;

	/* If seqno wrap happened, omit the wait with no-ops */
	if (likely(!i915_gem_has_seqno_wrapped(waiter_req->i915, seqno))) {
		intel_ring_emit(waiter, dw1 | wait_mbox);
		intel_ring_emit(waiter, seqno);
		intel_ring_emit(waiter, 0);
		intel_ring_emit(waiter, MI_NOOP);
	} else {
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
	}
	intel_ring_advance(waiter);

	return 0;
}

static void
gen5_seqno_barrier(struct intel_engine_cs *ring)
{
	/* MI_STORE are internally buffered by the GPU and not flushed
	 * either by MI_FLUSH or SyncFlush or any other combination of
	 * MI commands.
	 *
	 * "Only the submission of the store operation is guaranteed.
	 * The write result will be complete (coherent) some time later
	 * (this is practically a finite period but there is no guaranteed
	 * latency)."
	 *
	 * Empirically, we observe that we need a delay of at least 75us to
	 * be sure that the seqno write is visible by the CPU.
	 */
	usleep_range(125, 250);
}

static void
gen6_seqno_barrier(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	/* Workaround to force correct ordering between irq and seqno writes on
	 * ivb (and maybe also on snb) by reading from a CS register (like
	 * ACTHD) before reading the status page.
	 *
	 * Note that this effectively stalls the read by the time it takes to
	 * do a memory transaction, which more or less ensures that the write
	 * from the GPU has sufficient time to invalidate the CPU cacheline.
	 * Alternatively we could delay the interrupt from the CS ring to give
	 * the write time to land, but that would incur a delay after every
	 * batch i.e. much more frequent than a delay when waiting for the
	 * interrupt (with the same net latency).
	 *
	 * Also note that to prevent whole machine hangs on gen7, we have to
	 * take the spinlock to guard against concurrent cacheline access.
	 */
	spin_lock_irq(&dev_priv->uncore.lock);
	POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
	spin_unlock_irq(&dev_priv->uncore.lock);
}

static bool
gen5_ring_get_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	unsigned long flags;

	if (WARN_ON(!intel_irqs_enabled(dev_priv)))
		return false;

	spin_lock_irqsave(&dev_priv->irq_lock, flags);
	if (engine->irq_refcount++ == 0)
		gen5_enable_gt_irq(dev_priv, engine->irq_enable_mask);
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);

	return true;
}

static void
gen5_ring_put_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	unsigned long flags;

	spin_lock_irqsave(&dev_priv->irq_lock, flags);
	if (--engine->irq_refcount == 0)
		gen5_disable_gt_irq(dev_priv, engine->irq_enable_mask);
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
}

static bool
i9xx_ring_get_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	unsigned long flags;

	if (!intel_irqs_enabled(dev_priv))
		return false;

	spin_lock_irqsave(&dev_priv->irq_lock, flags);
	if (engine->irq_refcount++ == 0) {
		dev_priv->irq_mask &= ~engine->irq_enable_mask;
		I915_WRITE(IMR, dev_priv->irq_mask);
		POSTING_READ(IMR);
	}
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);

	return true;
}

1688
static void
1689
i9xx_ring_put_irq(struct intel_engine_cs *engine)
1690
{
1691
	struct drm_i915_private *dev_priv = engine->i915;
1692
	unsigned long flags;
1693

1694
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1695 1696
	if (--engine->irq_refcount == 0) {
		dev_priv->irq_mask |= engine->irq_enable_mask;
1697 1698 1699
		I915_WRITE(IMR, dev_priv->irq_mask);
		POSTING_READ(IMR);
	}
1700
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1701 1702
}
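
/*
 * Illustrative note, not part of the driver: irq_get/irq_put behave as a
 * nested enable/disable pair. Only the 0->1 get unmasks the engine's bit
 * in IMR and only the 1->0 put masks it again, so several waiters can
 * stack their requests, e.g. (hypothetical caller):
 *
 *	if (engine->irq_get(engine)) {
 *		... sleep until the seqno advances ...
 *		engine->irq_put(engine);
 *	}
 *
 * The refcount and dev_priv->irq_lock keep those transitions race-free.
 */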

static bool
1704
i8xx_ring_get_irq(struct intel_engine_cs *engine)
{
1706
	struct drm_i915_private *dev_priv = engine->i915;
1707
	unsigned long flags;

1709
	if (!intel_irqs_enabled(dev_priv))
		return false;

1712
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1713 1714
	if (engine->irq_refcount++ == 0) {
		dev_priv->irq_mask &= ~engine->irq_enable_mask;
		I915_WRITE16(IMR, dev_priv->irq_mask);
		POSTING_READ16(IMR);
	}
1718
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);

	return true;
}

static void
1724
i8xx_ring_put_irq(struct intel_engine_cs *engine)
{
1726
	struct drm_i915_private *dev_priv = engine->i915;
1727
	unsigned long flags;

1729
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1730 1731
	if (--engine->irq_refcount == 0) {
		dev_priv->irq_mask |= engine->irq_enable_mask;
		I915_WRITE16(IMR, dev_priv->irq_mask);
		POSTING_READ16(IMR);
	}
1735
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
}

1738
static int
1739
bsd_ring_flush(struct drm_i915_gem_request *req,
1740 1741
	       u32     invalidate_domains,
	       u32     flush_domains)
1742
{
1743
	struct intel_engine_cs *engine = req->engine;
1744 1745
	int ret;

1746
	ret = intel_ring_begin(req, 2);
1747 1748 1749
	if (ret)
		return ret;

1750 1751 1752
	intel_ring_emit(engine, MI_FLUSH);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);
1753
	return 0;
1754 1755
}

1756
static int
1757
i9xx_add_request(struct drm_i915_gem_request *req)
1758
{
1759
	struct intel_engine_cs *engine = req->engine;
1760 1761
	int ret;

1762
	ret = intel_ring_begin(req, 4);
1763 1764
	if (ret)
		return ret;
1765

1766 1767 1768
	intel_ring_emit(engine, MI_STORE_DWORD_INDEX);
	intel_ring_emit(engine,
			I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
1769
	intel_ring_emit(engine, req->seqno);
1770 1771
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	__intel_ring_advance(engine);
1772

1773
	return 0;
1774 1775
}

1776
static bool
1777
gen6_ring_get_irq(struct intel_engine_cs *engine)
1778
{
1779
	struct drm_i915_private *dev_priv = engine->i915;
1780
	unsigned long flags;
1781

1782 1783
	if (WARN_ON(!intel_irqs_enabled(dev_priv)))
		return false;
1784

1785
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1786
	if (engine->irq_refcount++ == 0) {
1787
		if (HAS_L3_DPF(dev_priv) && engine->id == RCS)
1788 1789
			I915_WRITE_IMR(engine,
				       ~(engine->irq_enable_mask |
1790
					 GT_PARITY_ERROR(dev_priv)));
1791
		else
1792 1793
			I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
		gen5_enable_gt_irq(dev_priv, engine->irq_enable_mask);
1794
	}
1795
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1796 1797 1798 1799 1800

	return true;
}

static void
1801
gen6_ring_put_irq(struct intel_engine_cs *engine)
1802
{
1803
	struct drm_i915_private *dev_priv = engine->i915;
1804
	unsigned long flags;
1805

1806
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1807
	if (--engine->irq_refcount == 0) {
1808 1809
		if (HAS_L3_DPF(dev_priv) && engine->id == RCS)
			I915_WRITE_IMR(engine, ~GT_PARITY_ERROR(dev_priv));
1810
		else
1811 1812
			I915_WRITE_IMR(engine, ~0);
		gen5_disable_gt_irq(dev_priv, engine->irq_enable_mask);
1813
	}
1814
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1815 1816
}

static bool
1818
hsw_vebox_get_irq(struct intel_engine_cs *engine)
{
1820
	struct drm_i915_private *dev_priv = engine->i915;
	unsigned long flags;

1823
	if (WARN_ON(!intel_irqs_enabled(dev_priv)))
		return false;

1826
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1827 1828 1829
	if (engine->irq_refcount++ == 0) {
		I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
		gen6_enable_pm_irq(dev_priv, engine->irq_enable_mask);
	}
1831
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);

	return true;
}

static void
1837
hsw_vebox_put_irq(struct intel_engine_cs *engine)
{
1839
	struct drm_i915_private *dev_priv = engine->i915;
	unsigned long flags;

1842
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1843 1844 1845
	if (--engine->irq_refcount == 0) {
		I915_WRITE_IMR(engine, ~0);
		gen6_disable_pm_irq(dev_priv, engine->irq_enable_mask);
	}
1847
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
}

1850
static bool
1851
gen8_ring_get_irq(struct intel_engine_cs *engine)
1852
{
1853
	struct drm_i915_private *dev_priv = engine->i915;
1854 1855
	unsigned long flags;

1856
	if (WARN_ON(!intel_irqs_enabled(dev_priv)))
1857 1858 1859
		return false;

	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1860
	if (engine->irq_refcount++ == 0) {
1861
		if (HAS_L3_DPF(dev_priv) && engine->id == RCS) {
1862 1863
			I915_WRITE_IMR(engine,
				       ~(engine->irq_enable_mask |
1864 1865
					 GT_RENDER_L3_PARITY_ERROR_INTERRUPT));
		} else {
1866
			I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
1867
		}
1868
		POSTING_READ(RING_IMR(engine->mmio_base));
1869 1870 1871 1872 1873 1874 1875
	}
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);

	return true;
}

static void
1876
gen8_ring_put_irq(struct intel_engine_cs *engine)
1877
{
1878
	struct drm_i915_private *dev_priv = engine->i915;
1879 1880 1881
	unsigned long flags;

	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1882
	if (--engine->irq_refcount == 0) {
1883
		if (HAS_L3_DPF(dev_priv) && engine->id == RCS) {
1884
			I915_WRITE_IMR(engine,
1885 1886
				       ~GT_RENDER_L3_PARITY_ERROR_INTERRUPT);
		} else {
1887
			I915_WRITE_IMR(engine, ~0);
1888
		}
1889
		POSTING_READ(RING_IMR(engine->mmio_base));
1890 1891 1892 1893
	}
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
}

1894
static int
1895
i965_dispatch_execbuffer(struct drm_i915_gem_request *req,
			 u64 offset, u32 length,
1897
			 unsigned dispatch_flags)
1898
{
1899
	struct intel_engine_cs *engine = req->engine;
1900
	int ret;
1901

1902
	ret = intel_ring_begin(req, 2);
1903 1904 1905
	if (ret)
		return ret;

1906
	intel_ring_emit(engine,
1907 1908
			MI_BATCH_BUFFER_START |
			MI_BATCH_GTT |
1909 1910
			(dispatch_flags & I915_DISPATCH_SECURE ?
			 0 : MI_BATCH_NON_SECURE_I965));
1911 1912
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);
1913

1914 1915 1916
	return 0;
}

1917 1918
/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
#define I830_BATCH_LIMIT (256*1024)
1919 1920
#define I830_TLB_ENTRIES (2)
#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
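
/*
 * Illustrative arithmetic, not in the original source: with the values
 * above, I830_WA_SIZE = max(2 * 4096, 256 * 1024) = 256 KiB, so the
 * scratch bo used by the workaround below is always large enough for
 * both the TLB-load writes and the largest batch userspace may submit
 * (I830_BATCH_LIMIT).
 */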
1921
static int
1922
i830_dispatch_execbuffer(struct drm_i915_gem_request *req,
1923 1924
			 u64 offset, u32 len,
			 unsigned dispatch_flags)
1925
{
1926
	struct intel_engine_cs *engine = req->engine;
1927
	u32 cs_offset = engine->scratch.gtt_offset;
1928
	int ret;
1929

1930
	ret = intel_ring_begin(req, 6);
1931 1932
	if (ret)
		return ret;
1933

1934
	/* Evict the invalid PTE TLBs */
1935 1936 1937 1938 1939 1940 1941
	intel_ring_emit(engine, COLOR_BLT_CMD | BLT_WRITE_RGBA);
	intel_ring_emit(engine, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096);
	intel_ring_emit(engine, I830_TLB_ENTRIES << 16 | 4); /* load each page */
	intel_ring_emit(engine, cs_offset);
	intel_ring_emit(engine, 0xdeadbeef);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);
1942

1943
	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1944 1945 1946
		if (len > I830_BATCH_LIMIT)
			return -ENOSPC;

1947
		ret = intel_ring_begin(req, 6 + 2);
1948 1949
		if (ret)
			return ret;
1950 1951 1952 1953 1954

		/* Blit the batch (which has now all relocs applied) to the
		 * stable batch scratch bo area (so that the CS never
		 * stumbles over its tlb invalidation bug) ...
		 */
1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965
		intel_ring_emit(engine, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA);
		intel_ring_emit(engine,
				BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096);
		intel_ring_emit(engine, DIV_ROUND_UP(len, 4096) << 16 | 4096);
		intel_ring_emit(engine, cs_offset);
		intel_ring_emit(engine, 4096);
		intel_ring_emit(engine, offset);

		intel_ring_emit(engine, MI_FLUSH);
		intel_ring_emit(engine, MI_NOOP);
		intel_ring_advance(engine);
1966 1967

		/* ... and execute it. */
1968
		offset = cs_offset;
1969
	}
1970

1971
	ret = intel_ring_begin(req, 2);
1972 1973 1974
	if (ret)
		return ret;

1975 1976 1977 1978
	intel_ring_emit(engine, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
	intel_ring_emit(engine, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
					  0 : MI_BATCH_NON_SECURE));
	intel_ring_advance(engine);
1979

1980 1981 1982 1983
	return 0;
}

static int
1984
i915_dispatch_execbuffer(struct drm_i915_gem_request *req,
			 u64 offset, u32 len,
1986
			 unsigned dispatch_flags)
1987
{
1988
	struct intel_engine_cs *engine = req->engine;
1989 1990
	int ret;

1991
	ret = intel_ring_begin(req, 2);
1992 1993 1994
	if (ret)
		return ret;

1995 1996 1997 1998
	intel_ring_emit(engine, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
	intel_ring_emit(engine, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
					  0 : MI_BATCH_NON_SECURE));
	intel_ring_advance(engine);
1999 2000 2001 2002

	return 0;
}

2003
static void cleanup_phys_status_page(struct intel_engine_cs *engine)
2004
{
2005
	struct drm_i915_private *dev_priv = engine->i915;
2006 2007 2008 2009

	if (!dev_priv->status_page_dmah)
		return;

2010
	drm_pci_free(dev_priv->dev, dev_priv->status_page_dmah);
2011
	engine->status_page.page_addr = NULL;
2012 2013
}

2014
static void cleanup_status_page(struct intel_engine_cs *engine)
2015
{
2016
	struct drm_i915_gem_object *obj;
2017

2018
	obj = engine->status_page.obj;
2019
	if (obj == NULL)
2020 2021
		return;

2022
	kunmap(sg_page(obj->pages->sgl));
	i915_gem_object_ggtt_unpin(obj);
2024
	drm_gem_object_unreference(&obj->base);
2025
	engine->status_page.obj = NULL;
2026 2027
}

2028
static int init_status_page(struct intel_engine_cs *engine)
2029
{
2030
	struct drm_i915_gem_object *obj = engine->status_page.obj;
2031

2032
	if (obj == NULL) {
2033
		unsigned flags;
2034
		int ret;
2035

2036
		obj = i915_gem_object_create(engine->i915->dev, 4096);
2037
		if (IS_ERR(obj)) {
2038
			DRM_ERROR("Failed to allocate status page\n");
2039
			return PTR_ERR(obj);
2040
		}
2041

2042 2043 2044 2045
		ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
		if (ret)
			goto err_unref;

2046
		flags = 0;
2047
		if (!HAS_LLC(engine->i915))
2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059
			/* On g33, we cannot place HWS above 256MiB, so
			 * restrict its pinning to the low mappable arena.
			 * Though this restriction is not documented for
			 * gen4, gen5, or byt, they also behave similarly
			 * and hang if the HWS is placed at the top of the
			 * GTT. To generalise, it appears that all !llc
			 * platforms have issues with us placing the HWS
			 * above the mappable region (even though we never
			 * actually map it).
			 */
			flags |= PIN_MAPPABLE;
		ret = i915_gem_obj_ggtt_pin(obj, 4096, flags);
2060 2061 2062 2063 2064 2065
		if (ret) {
err_unref:
			drm_gem_object_unreference(&obj->base);
			return ret;
		}

2066
		engine->status_page.obj = obj;
2067
	}
2068

2069 2070 2071
	engine->status_page.gfx_addr = i915_gem_obj_ggtt_offset(obj);
	engine->status_page.page_addr = kmap(sg_page(obj->pages->sgl));
	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
2072

2073
	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
2074
			engine->name, engine->status_page.gfx_addr);
2075 2076 2077 2078

	return 0;
}

2079
static int init_phys_status_page(struct intel_engine_cs *engine)
2080
{
2081
	struct drm_i915_private *dev_priv = engine->i915;
2082 2083 2084

	if (!dev_priv->status_page_dmah) {
		dev_priv->status_page_dmah =
2085
			drm_pci_alloc(dev_priv->dev, PAGE_SIZE, PAGE_SIZE);
2086 2087 2088 2089
		if (!dev_priv->status_page_dmah)
			return -ENOMEM;
	}

2090 2091
	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
2092 2093 2094 2095

	return 0;
}

2096
void intel_unpin_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
2097
{
2098 2099 2100
	GEM_BUG_ON(ringbuf->vma == NULL);
	GEM_BUG_ON(ringbuf->virtual_start == NULL);

2101
	if (HAS_LLC(ringbuf->obj->base.dev) && !ringbuf->obj->stolen)
2102
		i915_gem_object_unpin_map(ringbuf->obj);
2103
	else
2104
		i915_vma_unpin_iomap(ringbuf->vma);
2105
	ringbuf->virtual_start = NULL;
2106

2107
	i915_gem_object_ggtt_unpin(ringbuf->obj);
2108
	ringbuf->vma = NULL;
2109 2110
}

2111
int intel_pin_and_map_ringbuffer_obj(struct drm_i915_private *dev_priv,
2112 2113 2114
				     struct intel_ringbuffer *ringbuf)
{
	struct drm_i915_gem_object *obj = ringbuf->obj;
2115 2116
	/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
	unsigned flags = PIN_OFFSET_BIAS | 4096;
2117
	void *addr;
2118 2119
	int ret;

2120
	if (HAS_LLC(dev_priv) && !obj->stolen) {
2121
		ret = i915_gem_obj_ggtt_pin(obj, PAGE_SIZE, flags);
2122 2123
		if (ret)
			return ret;
2124

2125
		ret = i915_gem_object_set_to_cpu_domain(obj, true);
2126 2127
		if (ret)
			goto err_unpin;
2128

2129 2130 2131
		addr = i915_gem_object_pin_map(obj);
		if (IS_ERR(addr)) {
			ret = PTR_ERR(addr);
2132
			goto err_unpin;
2133 2134
		}
	} else {
2135 2136
		ret = i915_gem_obj_ggtt_pin(obj, PAGE_SIZE,
					    flags | PIN_MAPPABLE);
2137 2138
		if (ret)
			return ret;
2139

2140
		ret = i915_gem_object_set_to_gtt_domain(obj, true);
2141 2142
		if (ret)
			goto err_unpin;
2143

2144 2145 2146
		/* Access through the GTT requires the device to be awake. */
		assert_rpm_wakelock_held(dev_priv);

2147 2148 2149
		addr = i915_vma_pin_iomap(i915_gem_obj_to_ggtt(obj));
		if (IS_ERR(addr)) {
			ret = PTR_ERR(addr);
2150
			goto err_unpin;
2151
		}
2152 2153
	}

2154
	ringbuf->virtual_start = addr;
2155
	ringbuf->vma = i915_gem_obj_to_ggtt(obj);
2156
	return 0;
2157 2158 2159 2160

err_unpin:
	i915_gem_object_ggtt_unpin(obj);
	return ret;
2161 2162
}

2163
static void intel_destroy_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
2164
{
2165 2166 2167 2168
	drm_gem_object_unreference(&ringbuf->obj->base);
	ringbuf->obj = NULL;
}

2169 2170
static int intel_alloc_ringbuffer_obj(struct drm_device *dev,
				      struct intel_ringbuffer *ringbuf)
2171
{
2172
	struct drm_i915_gem_object *obj;
2173

2174 2175
	obj = NULL;
	if (!HAS_LLC(dev))
2176
		obj = i915_gem_object_create_stolen(dev, ringbuf->size);
2177
	if (obj == NULL)
2178
		obj = i915_gem_object_create(dev, ringbuf->size);
2179 2180
	if (IS_ERR(obj))
		return PTR_ERR(obj);
2181

2182 2183 2184
	/* mark ring buffers as read-only from GPU side by default */
	obj->gt_ro = 1;

2185
	ringbuf->obj = obj;
2186

2187
	return 0;
2188 2189
}

2190 2191 2192 2193 2194 2195 2196
struct intel_ringbuffer *
intel_engine_create_ringbuffer(struct intel_engine_cs *engine, int size)
{
	struct intel_ringbuffer *ring;
	int ret;

	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
2197 2198 2199
	if (ring == NULL) {
		DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s\n",
				 engine->name);
2200
		return ERR_PTR(-ENOMEM);
2201
	}
2202

2203
	ring->engine = engine;
2204
	list_add(&ring->link, &engine->buffers);
2205 2206 2207 2208 2209 2210 2211

	ring->size = size;
	/* Workaround an erratum on the i830 which causes a hang if
	 * the TAIL pointer points to within the last 2 cachelines
	 * of the buffer.
	 */
	ring->effective_size = size;
2212
	if (IS_I830(engine->i915) || IS_845G(engine->i915))
2213 2214 2215 2216 2217
		ring->effective_size -= 2 * CACHELINE_BYTES;

	ring->last_retired_head = -1;
	intel_ring_update_space(ring);

2218
	ret = intel_alloc_ringbuffer_obj(engine->i915->dev, ring);
2219
	if (ret) {
2220 2221 2222
		DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s: %d\n",
				 engine->name, ret);
		list_del(&ring->link);
2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233
		kfree(ring);
		return ERR_PTR(ret);
	}

	return ring;
}
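
/*
 * Worked example, illustrative only: the legacy rings below are created
 * with 32 * PAGE_SIZE = 128 KiB (assuming 4 KiB pages), so ring->size is
 * 131072. On i830/845g the erratum above trims two cachelines (assuming
 * CACHELINE_BYTES == 64), giving ring->effective_size = 131072 - 128 =
 * 130944; on every other platform effective_size == size.
 */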

void
intel_ringbuffer_free(struct intel_ringbuffer *ring)
{
	intel_destroy_ringbuffer_obj(ring);
2234
	list_del(&ring->link);
2235 2236 2237
	kfree(ring);
}

2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254
static int intel_ring_context_pin(struct i915_gem_context *ctx,
				  struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];
	int ret;

	lockdep_assert_held(&ctx->i915->dev->struct_mutex);

	if (ce->pin_count++)
		return 0;

	if (ce->state) {
		ret = i915_gem_obj_ggtt_pin(ce->state, ctx->ggtt_alignment, 0);
		if (ret)
			goto error;
	}

2255 2256 2257 2258 2259 2260 2261 2262 2263 2264
	/* The kernel context is only used as a placeholder for flushing the
	 * active context. It is never used for submitting user rendering and
	 * as such never requires the golden render context, and so we can skip
	 * emitting it when we switch to the kernel context. This is required
	 * as during eviction we cannot allocate and pin the renderstate in
	 * order to initialise the context.
	 */
	if (ctx == ctx->i915->kernel_context)
		ce->initialised = true;

2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288
	i915_gem_context_reference(ctx);
	return 0;

error:
	ce->pin_count = 0;
	return ret;
}

static void intel_ring_context_unpin(struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];

	lockdep_assert_held(&ctx->i915->dev->struct_mutex);

	if (--ce->pin_count)
		return;

	if (ce->state)
		i915_gem_object_ggtt_unpin(ce->state);

	i915_gem_context_unreference(ctx);
}

2289
static int intel_init_ring_buffer(struct drm_device *dev,
2290
				  struct intel_engine_cs *engine)
2291
{
2292
	struct drm_i915_private *dev_priv = to_i915(dev);
2293
	struct intel_ringbuffer *ringbuf;
2294 2295
	int ret;

2296
	WARN_ON(engine->buffer);
2297

2298
	engine->i915 = dev_priv;
2299 2300 2301 2302 2303 2304 2305
	INIT_LIST_HEAD(&engine->active_list);
	INIT_LIST_HEAD(&engine->request_list);
	INIT_LIST_HEAD(&engine->execlist_queue);
	INIT_LIST_HEAD(&engine->buffers);
	i915_gem_batch_pool_init(dev, &engine->batch_pool);
	memset(engine->semaphore.sync_seqno, 0,
	       sizeof(engine->semaphore.sync_seqno));
2306

2307 2308 2309
	ret = intel_engine_init_breadcrumbs(engine);
	if (ret)
		goto error;
2310

2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321
	/* We may need to do things with the shrinker which
	 * require us to immediately switch back to the default
	 * context. This can cause a problem as pinning the
	 * default context also requires GTT space which may not
	 * be available. To avoid this we always pin the default
	 * context.
	 */
	ret = intel_ring_context_pin(dev_priv->kernel_context, engine);
	if (ret)
		goto error;

2322
	ringbuf = intel_engine_create_ringbuffer(engine, 32 * PAGE_SIZE);
2323 2324 2325 2326
	if (IS_ERR(ringbuf)) {
		ret = PTR_ERR(ringbuf);
		goto error;
	}
2327
	engine->buffer = ringbuf;
2328

2329
	if (I915_NEED_GFX_HWS(dev_priv)) {
2330
		ret = init_status_page(engine);
2331
		if (ret)
2332
			goto error;
2333
	} else {
2334 2335
		WARN_ON(engine->id != RCS);
		ret = init_phys_status_page(engine);
2336
		if (ret)
2337
			goto error;
2338 2339
	}

2340
	ret = intel_pin_and_map_ringbuffer_obj(dev_priv, ringbuf);
2341 2342
	if (ret) {
		DRM_ERROR("Failed to pin and map ringbuffer %s: %d\n",
2343
				engine->name, ret);
2344 2345
		intel_destroy_ringbuffer_obj(ringbuf);
		goto error;
2346
	}
2347

2348
	ret = i915_cmd_parser_init_ring(engine);
2349
	if (ret)
2350 2351 2352
		goto error;

	return 0;
2353

2354
error:
2355
	intel_cleanup_engine(engine);
2356
	return ret;
2357 2358
}

2359
void intel_cleanup_engine(struct intel_engine_cs *engine)
2360
{
2361
	struct drm_i915_private *dev_priv;
2362

2363
	if (!intel_engine_initialized(engine))
2364 2365
		return;

2366
	dev_priv = engine->i915;
2367

2368
	if (engine->buffer) {
2369
		intel_stop_engine(engine);
2370
		WARN_ON(!IS_GEN2(dev_priv) && (I915_READ_MODE(engine) & MODE_IDLE) == 0);
2371

2372 2373 2374
		intel_unpin_ringbuffer_obj(engine->buffer);
		intel_ringbuffer_free(engine->buffer);
		engine->buffer = NULL;
2375
	}
2376

2377 2378
	if (engine->cleanup)
		engine->cleanup(engine);

2380
	if (I915_NEED_GFX_HWS(dev_priv)) {
2381
		cleanup_status_page(engine);
2382
	} else {
2383 2384
		WARN_ON(engine->id != RCS);
		cleanup_phys_status_page(engine);
2385
	}
2386

2387 2388
	i915_cmd_parser_fini_ring(engine);
	i915_gem_batch_pool_fini(&engine->batch_pool);
2389
	intel_engine_fini_breadcrumbs(engine);
2390 2391 2392

	intel_ring_context_unpin(dev_priv->kernel_context, engine);

2393
	engine->i915 = NULL;
2394 2395
}

2396
int intel_engine_idle(struct intel_engine_cs *engine)
2397
{
2398
	struct drm_i915_gem_request *req;
2399 2400

	/* Wait upon the last request to be completed */
2401
	if (list_empty(&engine->request_list))
2402 2403
		return 0;

2404 2405 2406
	req = list_entry(engine->request_list.prev,
			 struct drm_i915_gem_request,
			 list);
2407 2408 2409

	/* Make sure we do not trigger any retires */
	return __i915_wait_request(req,
2410
				   req->i915->mm.interruptible,
2411
				   NULL, NULL);
2412 2413
}

2414
int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
2415
{
2416 2417 2418 2419 2420 2421
	int ret;

	/* Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
2422
	request->reserved_space += LEGACY_REQUEST_SIZE;
2423

2424
	request->ringbuf = request->engine->buffer;
2425 2426 2427 2428 2429

	ret = intel_ring_begin(request, 0);
	if (ret)
		return ret;

2430
	request->reserved_space -= LEGACY_REQUEST_SIZE;
2431
	return 0;
2432 2433
}
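
/*
 * Illustrative timeline, not part of the driver, of the reserved-space
 * dance above (assuming LEGACY_REQUEST_SIZE == 200):
 *
 *	request->reserved_space += 200;	// budget for the final breadcrumb
 *	intel_ring_begin(request, 0);	// waits until ~200 bytes are free
 *	request->reserved_space -= 200;	// emission may now dip into them
 *
 * Request construction can therefore fail early here with -EINTR/-EAGAIN,
 * while finalising the request never has to wait for ring space.
 */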

2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452
static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
{
	struct intel_ringbuffer *ringbuf = req->ringbuf;
	struct intel_engine_cs *engine = req->engine;
	struct drm_i915_gem_request *target;

	intel_ring_update_space(ringbuf);
	if (ringbuf->space >= bytes)
		return 0;

	/*
	 * Space is reserved in the ringbuffer for finalising the request,
	 * as that cannot be allowed to fail. During request finalisation,
	 * reserved_space is set to 0 to stop the overallocation and the
	 * assumption is that then we never need to wait (which has the
	 * risk of failing with EINTR).
	 *
	 * See also i915_gem_request_alloc() and i915_add_request().
	 */
2453
	GEM_BUG_ON(!req->reserved_space);
2454 2455 2456 2457

	list_for_each_entry(target, &engine->request_list, list) {
		unsigned space;

2458
		/*
2459 2460 2461
		 * The request queue is per-engine, so can contain requests
		 * from multiple ringbuffers. Here, we must ignore any that
		 * aren't from the ringbuffer we're considering.
2462
		 */
2463 2464 2465 2466 2467 2468 2469 2470
		if (target->ringbuf != ringbuf)
			continue;

		/* Would completion of this request free enough space? */
		space = __intel_ring_space(target->postfix, ringbuf->tail,
					   ringbuf->size);
		if (space >= bytes)
			break;
2471
	}
2472

2473 2474 2475 2476
	if (WARN_ON(&target->list == &engine->request_list))
		return -ENOSPC;

	return i915_wait_request(target);
2477 2478
}

2479
int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
{
2481
	struct intel_ringbuffer *ringbuf = req->ringbuf;
2482
	int remain_actual = ringbuf->size - ringbuf->tail;
2483 2484 2485
	int remain_usable = ringbuf->effective_size - ringbuf->tail;
	int bytes = num_dwords * sizeof(u32);
	int total_bytes, wait_bytes;
2486
	bool need_wrap = false;
2487

2488
	total_bytes = bytes + req->reserved_space;
2489

2490 2491 2492 2493 2494 2495 2496
	if (unlikely(bytes > remain_usable)) {
		/*
		 * Not enough space for the basic request. So need to flush
		 * out the remainder and then wait for base + reserved.
		 */
		wait_bytes = remain_actual + total_bytes;
		need_wrap = true;
2497 2498 2499 2500 2501 2502 2503
	} else if (unlikely(total_bytes > remain_usable)) {
		/*
		 * The base request will fit but the reserved space
		 * falls off the end. So we don't need an immediate wrap
		 * and only need to effectively wait for the reserved
		 * size space from the start of ringbuffer.
		 */
2504
		wait_bytes = remain_actual + req->reserved_space;
2505
	} else {
2506 2507
		/* No wrapping required, just waiting. */
		wait_bytes = total_bytes;
	}

2510 2511
	if (wait_bytes > ringbuf->space) {
		int ret = wait_for_space(req, wait_bytes);
		if (unlikely(ret))
			return ret;
2514

2515
		intel_ring_update_space(ringbuf);
2516 2517
		if (unlikely(ringbuf->space < wait_bytes))
			return -EAGAIN;
	}

2520 2521 2522
	if (unlikely(need_wrap)) {
		GEM_BUG_ON(remain_actual > ringbuf->space);
		GEM_BUG_ON(ringbuf->tail + remain_actual > ringbuf->size);
2523

2524 2525 2526 2527 2528 2529
		/* Fill the tail with MI_NOOP */
		memset(ringbuf->virtual_start + ringbuf->tail,
		       0, remain_actual);
		ringbuf->tail = 0;
		ringbuf->space -= remain_actual;
	}
2530

2531 2532
	ringbuf->space -= bytes;
	GEM_BUG_ON(ringbuf->space < 0);
2533
	return 0;
2534
}
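
/*
 * Worked example, illustrative numbers only: take a ring with
 * size == effective_size == 4096, tail == 4000, req->reserved_space == 200
 * and a request for 32 dwords (128 bytes). Then
 *
 *	remain_actual = 4096 - 4000 = 96
 *	remain_usable = 4096 - 4000 = 96
 *	total_bytes   = 128 + 200   = 328
 *
 * bytes (128) > remain_usable (96), so the tail must wrap and
 * wait_bytes = 96 + 328 = 424. Once that much space is free, the 96
 * stale bytes at the end are filled with MI_NOOP, tail resets to 0 and
 * the 128-byte request is emitted from the start of the buffer.
 */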
2535

2536
/* Align the ring tail to a cacheline boundary */
2537
int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
2538
{
2539
	struct intel_engine_cs *engine = req->engine;
2540
	int num_dwords = (engine->buffer->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
2541 2542 2543 2544 2545
	int ret;

	if (num_dwords == 0)
		return 0;

2546
	num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
2547
	ret = intel_ring_begin(req, num_dwords);
2548 2549 2550 2551
	if (ret)
		return ret;

	while (num_dwords--)
2552
		intel_ring_emit(engine, MI_NOOP);
2553

2554
	intel_ring_advance(engine);
2555 2556 2557 2558

	return 0;
}
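
/*
 * Worked example, illustrative only, assuming CACHELINE_BYTES == 64:
 * a cacheline holds 16 dwords. If the tail currently sits 40 bytes
 * (10 dwords) into a cacheline, num_dwords = 16 - 10 = 6, so six
 * MI_NOOPs are emitted and the next command starts cacheline-aligned.
 */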

2559
void intel_ring_init_seqno(struct intel_engine_cs *engine, u32 seqno)
2560
{
2561
	struct drm_i915_private *dev_priv = engine->i915;
2562

2563 2564 2565 2566 2567 2568 2569 2570
	/* Our semaphore implementation is strictly monotonic (i.e. we proceed
	 * so long as the semaphore value in the register/page is greater
	 * than the sync value), so whenever we reset the seqno,
	 * so long as we reset the tracking semaphore value to 0, it will
	 * always be before the next request's seqno. If we don't reset
	 * the semaphore value, then when the seqno moves backwards all
	 * future waits will complete instantly (causing rendering corruption).
	 */
2571
	if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) {
2572 2573
		I915_WRITE(RING_SYNC_0(engine->mmio_base), 0);
		I915_WRITE(RING_SYNC_1(engine->mmio_base), 0);
2574
		if (HAS_VEBOX(dev_priv))
2575
			I915_WRITE(RING_SYNC_2(engine->mmio_base), 0);
2576
	}
2577 2578 2579 2580 2581 2582 2583 2584
	if (dev_priv->semaphore_obj) {
		struct drm_i915_gem_object *obj = dev_priv->semaphore_obj;
		struct page *page = i915_gem_object_get_dirty_page(obj, 0);
		void *semaphores = kmap(page);
		memset(semaphores + GEN8_SEMAPHORE_OFFSET(engine->id, 0),
		       0, I915_NUM_ENGINES * gen8_semaphore_seqno_size);
		kunmap(page);
	}
2585 2586
	memset(engine->semaphore.sync_seqno, 0,
	       sizeof(engine->semaphore.sync_seqno));
2587

2588 2589 2590
	intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
	if (engine->irq_seqno_barrier)
		engine->irq_seqno_barrier(engine);
2591
	engine->last_submitted_seqno = seqno;
2592

2593
	engine->hangcheck.seqno = seqno;
2594 2595 2596 2597 2598 2599 2600

	/* After manually advancing the seqno, fake the interrupt in case
	 * there are any waiters for that seqno.
	 */
	rcu_read_lock();
	intel_engine_wakeup(engine);
	rcu_read_unlock();
2601
}
2602

2603
static void gen6_bsd_ring_write_tail(struct intel_engine_cs *engine,
2604
				     u32 value)
2605
{
2606
	struct drm_i915_private *dev_priv = engine->i915;
2607

2608 2609
	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);

2610
	/* Every tail move must follow the sequence below */
2611 2612 2613 2614

	/* Disable notification that the ring is IDLE. The GT
	 * will then assume that it is busy and bring it out of rc6.
	 */
2615 2616
	I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL,
		      _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
2617 2618

	/* Clear the context id. Here be magic! */
2619
	I915_WRITE64_FW(GEN6_BSD_RNCID, 0x0);
2620

2621
	/* Wait for the ring not to be idle, i.e. for it to wake up. */
2622 2623 2624 2625 2626
	if (intel_wait_for_register_fw(dev_priv,
				       GEN6_BSD_SLEEP_PSMI_CONTROL,
				       GEN6_BSD_SLEEP_INDICATOR,
				       0,
				       50))
2627
		DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
2628

2629
	/* Now that the ring is fully powered up, update the tail */
2630 2631
	I915_WRITE_FW(RING_TAIL(engine->mmio_base), value);
	POSTING_READ_FW(RING_TAIL(engine->mmio_base));
2632 2633 2634 2635

	/* Let the ring send IDLE messages to the GT again,
	 * and so let it sleep to conserve power when idle.
	 */
2636 2637 2638 2639
	I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL,
		      _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));

	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
2640 2641
}

2642
static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req,
2643
			       u32 invalidate, u32 flush)
2644
{
2645
	struct intel_engine_cs *engine = req->engine;
2646
	uint32_t cmd;
2647 2648
	int ret;

2649
	ret = intel_ring_begin(req, 4);
2650 2651 2652
	if (ret)
		return ret;

2653
	cmd = MI_FLUSH_DW;
2654
	if (INTEL_GEN(req->i915) >= 8)
		cmd += 1;
2656 2657 2658 2659 2660 2661 2662 2663

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

2664 2665 2666 2667 2668 2669
	/*
	 * Bspec vol 1c.5 - video engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
2670
	if (invalidate & I915_GEM_GPU_DOMAINS)
2671 2672
		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;

2673 2674 2675
	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine,
			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
2676
	if (INTEL_GEN(req->i915) >= 8) {
2677 2678
		intel_ring_emit(engine, 0); /* upper addr */
		intel_ring_emit(engine, 0); /* value */
	} else  {
2680 2681
		intel_ring_emit(engine, 0);
		intel_ring_emit(engine, MI_NOOP);
	}
2683
	intel_ring_advance(engine);
2684
	return 0;
2685 2686
}
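
/*
 * Interpretation, not from the original comments: the "cmd += 1" above
 * grows the MI_FLUSH_DW length field on gen8+ because the post-sync
 * address becomes 64-bit, which is why the gen8 branch emits an extra
 * "upper addr" dword while older parts pad the remaining slot with
 * MI_NOOP so the four dwords reserved by intel_ring_begin() are fully
 * consumed.
 */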

2687
static int
2688
gen8_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
			      u64 offset, u32 len,
2690
			      unsigned dispatch_flags)
2691
{
2692
	struct intel_engine_cs *engine = req->engine;
2693
	bool ppgtt = USES_PPGTT(engine->dev) &&
2694
			!(dispatch_flags & I915_DISPATCH_SECURE);
2695 2696
	int ret;

2697
	ret = intel_ring_begin(req, 4);
2698 2699 2700 2701
	if (ret)
		return ret;

	/* FIXME(BDW): Address space and security selectors. */
2702
	intel_ring_emit(engine, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
2703 2704
			(dispatch_flags & I915_DISPATCH_RS ?
			 MI_BATCH_RESOURCE_STREAMER : 0));
2705 2706 2707 2708
	intel_ring_emit(engine, lower_32_bits(offset));
	intel_ring_emit(engine, upper_32_bits(offset));
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);
2709 2710 2711 2712

	return 0;
}

2713
static int
2714
hsw_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
2715 2716
			     u64 offset, u32 len,
			     unsigned dispatch_flags)
2717
{
2718
	struct intel_engine_cs *engine = req->engine;
2719 2720
	int ret;

2721
	ret = intel_ring_begin(req, 2);
2722 2723 2724
	if (ret)
		return ret;

2725
	intel_ring_emit(engine,
2726
			MI_BATCH_BUFFER_START |
2727
			(dispatch_flags & I915_DISPATCH_SECURE ?
2728 2729 2730
			 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
			(dispatch_flags & I915_DISPATCH_RS ?
			 MI_BATCH_RESOURCE_STREAMER : 0));
2731
	/* bit0-7 is the length on GEN6+ */
2732 2733
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);
2734 2735 2736 2737

	return 0;
}

2738
static int
2739
gen6_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
			      u64 offset, u32 len,
2741
			      unsigned dispatch_flags)
2742
{
2743
	struct intel_engine_cs *engine = req->engine;
2744
	int ret;
2745

2746
	ret = intel_ring_begin(req, 2);
2747 2748
	if (ret)
		return ret;
2749

2750
	intel_ring_emit(engine,
2751
			MI_BATCH_BUFFER_START |
2752 2753
			(dispatch_flags & I915_DISPATCH_SECURE ?
			 0 : MI_BATCH_NON_SECURE_I965));
2754
	/* bit0-7 is the length on GEN6+ */
2755 2756
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);
2757

2758
	return 0;
2759 2760
}

2761 2762
/* Blitter support (SandyBridge+) */

2763
static int gen6_ring_flush(struct drm_i915_gem_request *req,
2764
			   u32 invalidate, u32 flush)
{
2766
	struct intel_engine_cs *engine = req->engine;
2767
	uint32_t cmd;
2768 2769
	int ret;

2770
	ret = intel_ring_begin(req, 4);
2771 2772 2773
	if (ret)
		return ret;

2774
	cmd = MI_FLUSH_DW;
2775
	if (INTEL_GEN(req->i915) >= 8)
		cmd += 1;
2777 2778 2779 2780 2781 2782 2783 2784

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

2785 2786 2787 2788 2789 2790
	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
2791
	if (invalidate & I915_GEM_DOMAIN_RENDER)
2792
		cmd |= MI_INVALIDATE_TLB;
2793 2794 2795
	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine,
			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
2796
	if (INTEL_GEN(req->i915) >= 8) {
2797 2798
		intel_ring_emit(engine, 0); /* upper addr */
		intel_ring_emit(engine, 0); /* value */
	} else  {
2800 2801
		intel_ring_emit(engine, 0);
		intel_ring_emit(engine, MI_NOOP);
	}
2803
	intel_ring_advance(engine);

2805
	return 0;
}

2808 2809 2810
static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv,
				       struct intel_engine_cs *engine)
{
2811
	struct drm_i915_gem_object *obj;
2812
	int ret, i;
2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834

	if (!i915_semaphore_is_enabled(dev_priv))
		return;

	if (INTEL_GEN(dev_priv) >= 8 && !dev_priv->semaphore_obj) {
		obj = i915_gem_object_create(dev_priv->dev, 4096);
		if (IS_ERR(obj)) {
			DRM_ERROR("Failed to allocate semaphore bo. Disabling semaphores\n");
			i915.semaphores = 0;
		} else {
			i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
			ret = i915_gem_obj_ggtt_pin(obj, 0, PIN_NONBLOCK);
			if (ret != 0) {
				drm_gem_object_unreference(&obj->base);
				DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n");
				i915.semaphores = 0;
			} else {
				dev_priv->semaphore_obj = obj;
			}
		}
	}

2835 2836 2837 2838
	if (!i915_semaphore_is_enabled(dev_priv))
		return;

	if (INTEL_GEN(dev_priv) >= 8) {
2839 2840
		u64 offset = i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj);

2841 2842
		engine->semaphore.sync_to = gen8_ring_sync;
		engine->semaphore.signal = gen8_xcs_signal;
2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853

		for (i = 0; i < I915_NUM_ENGINES; i++) {
			u64 ring_offset;

			if (i != engine->id)
				ring_offset = offset + GEN8_SEMAPHORE_OFFSET(engine->id, i);
			else
				ring_offset = MI_SEMAPHORE_SYNC_INVALID;

			engine->semaphore.signal_ggtt[i] = ring_offset;
		}
2854 2855 2856
	} else if (INTEL_GEN(dev_priv) >= 6) {
		engine->semaphore.sync_to = gen6_ring_sync;
		engine->semaphore.signal = gen6_signal;
2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904

		/*
		 * This semaphore scheme is only used on pre-gen8 platforms,
		 * and there is no VCS2 ring before gen8, so the semaphore
		 * between RCS and VCS2 is initialized as INVALID here.
		 * Gen8 sets up the semaphore between VCS2 and RCS separately.
		 */
		for (i = 0; i < I915_NUM_ENGINES; i++) {
			static const struct {
				u32 wait_mbox;
				i915_reg_t mbox_reg;
			} sem_data[I915_NUM_ENGINES][I915_NUM_ENGINES] = {
				[RCS] = {
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_RV,  .mbox_reg = GEN6_VRSYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_RB,  .mbox_reg = GEN6_BRSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_RVE, .mbox_reg = GEN6_VERSYNC },
				},
				[VCS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VR,  .mbox_reg = GEN6_RVSYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VB,  .mbox_reg = GEN6_BVSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VVE, .mbox_reg = GEN6_VEVSYNC },
				},
				[BCS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_BR,  .mbox_reg = GEN6_RBSYNC },
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_BV,  .mbox_reg = GEN6_VBSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_BVE, .mbox_reg = GEN6_VEBSYNC },
				},
				[VECS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VER, .mbox_reg = GEN6_RVESYNC },
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VEV, .mbox_reg = GEN6_VVESYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VEB, .mbox_reg = GEN6_BVESYNC },
				},
			};
			u32 wait_mbox;
			i915_reg_t mbox_reg;

			if (i == engine->id || i == VCS2) {
				wait_mbox = MI_SEMAPHORE_SYNC_INVALID;
				mbox_reg = GEN6_NOSYNC;
			} else {
				wait_mbox = sem_data[engine->id][i].wait_mbox;
				mbox_reg = sem_data[engine->id][i].mbox_reg;
			}

			engine->semaphore.mbox.wait[i] = wait_mbox;
			engine->semaphore.mbox.signal[i] = mbox_reg;
		}
2905 2906 2907
	}
}

2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921
static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
				struct intel_engine_cs *engine)
{
	if (INTEL_GEN(dev_priv) >= 8) {
		engine->irq_get = gen8_ring_get_irq;
		engine->irq_put = gen8_ring_put_irq;
		engine->irq_seqno_barrier = gen6_seqno_barrier;
	} else if (INTEL_GEN(dev_priv) >= 6) {
		engine->irq_get = gen6_ring_get_irq;
		engine->irq_put = gen6_ring_put_irq;
		engine->irq_seqno_barrier = gen6_seqno_barrier;
	} else if (INTEL_GEN(dev_priv) >= 5) {
		engine->irq_get = gen5_ring_get_irq;
		engine->irq_put = gen5_ring_put_irq;
2922
		engine->irq_seqno_barrier = gen5_seqno_barrier;
2923 2924 2925 2926 2927 2928 2929 2930 2931
	} else if (INTEL_GEN(dev_priv) >= 3) {
		engine->irq_get = i9xx_ring_get_irq;
		engine->irq_put = i9xx_ring_put_irq;
	} else {
		engine->irq_get = i8xx_ring_get_irq;
		engine->irq_put = i8xx_ring_put_irq;
	}
}

2932 2933 2934
static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
				      struct intel_engine_cs *engine)
{
2935
	engine->init_hw = init_ring_common;
2936
	engine->write_tail = ring_write_tail;
2937

2938 2939
	engine->add_request = i9xx_add_request;
	if (INTEL_GEN(dev_priv) >= 6)
2940
		engine->add_request = gen6_add_request;
2941 2942 2943 2944

	if (INTEL_GEN(dev_priv) >= 8)
		engine->dispatch_execbuffer = gen8_ring_dispatch_execbuffer;
	else if (INTEL_GEN(dev_priv) >= 6)
2945
		engine->dispatch_execbuffer = gen6_ring_dispatch_execbuffer;
2946
	else if (INTEL_GEN(dev_priv) >= 4)
2947
		engine->dispatch_execbuffer = i965_dispatch_execbuffer;
2948 2949 2950 2951
	else if (IS_I830(dev_priv) || IS_845G(dev_priv))
		engine->dispatch_execbuffer = i830_dispatch_execbuffer;
	else
		engine->dispatch_execbuffer = i915_dispatch_execbuffer;
2952

2953
	intel_ring_init_irq(dev_priv, engine);
2954
	intel_ring_init_semaphores(dev_priv, engine);
2955 2956
}

2957 2958
int intel_init_render_ring_buffer(struct drm_device *dev)
{
2959
	struct drm_i915_private *dev_priv = dev->dev_private;
2960
	struct intel_engine_cs *engine = &dev_priv->engine[RCS];
2961
	int ret;
2962

2963 2964 2965
	engine->name = "render ring";
	engine->id = RCS;
	engine->exec_id = I915_EXEC_RENDER;
2966
	engine->hw_id = 0;
2967
	engine->mmio_base = RENDER_RING_BASE;
2968

2969 2970
	intel_ring_default_vfuncs(dev_priv, engine);

2971 2972
	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;

2973
	if (INTEL_GEN(dev_priv) >= 8) {
2974
		engine->init_context = intel_rcs_ctx_init;
2975
		engine->add_request = gen8_render_add_request;
2976
		engine->flush = gen8_render_ring_flush;
2977
		if (i915_semaphore_is_enabled(dev_priv))
2978
			engine->semaphore.signal = gen8_rcs_signal;
2979
	} else if (INTEL_GEN(dev_priv) >= 6) {
2980 2981
		engine->init_context = intel_rcs_ctx_init;
		engine->flush = gen7_render_ring_flush;
2982
		if (IS_GEN6(dev_priv))
2983
			engine->flush = gen6_render_ring_flush;
2984
	} else if (IS_GEN5(dev_priv)) {
2985
		engine->flush = gen4_render_ring_flush;
2986
	} else {
2987
		if (INTEL_GEN(dev_priv) < 4)
2988
			engine->flush = gen2_render_ring_flush;
2989
		else
2990 2991
			engine->flush = gen4_render_ring_flush;
		engine->irq_enable_mask = I915_USER_INTERRUPT;
2992
	}

2994
	if (IS_HASWELL(dev_priv))
2995
		engine->dispatch_execbuffer = hsw_ring_dispatch_execbuffer;
2996

2997 2998
	engine->init_hw = init_render_ring;
	engine->cleanup = render_ring_cleanup;
2999

3000
	ret = intel_init_ring_buffer(dev, engine);
3001 3002 3003
	if (ret)
		return ret;

3004
	if (INTEL_GEN(dev_priv) >= 6) {
3005 3006 3007 3008 3009
		ret = intel_init_pipe_control(engine, 4096);
		if (ret)
			return ret;
	} else if (HAS_BROKEN_CS_TLB(dev_priv)) {
		ret = intel_init_pipe_control(engine, I830_WA_SIZE);
3010 3011 3012 3013 3014
		if (ret)
			return ret;
	}

	return 0;
3015 3016 3017 3018
}

int intel_init_bsd_ring_buffer(struct drm_device *dev)
{
3019
	struct drm_i915_private *dev_priv = dev->dev_private;
3020
	struct intel_engine_cs *engine = &dev_priv->engine[VCS];
3021

3022 3023 3024
	engine->name = "bsd ring";
	engine->id = VCS;
	engine->exec_id = I915_EXEC_BSD;
3025
	engine->hw_id = 1;
3026

3027 3028
	intel_ring_default_vfuncs(dev_priv, engine);

3029
	if (INTEL_GEN(dev_priv) >= 6) {
3030
		engine->mmio_base = GEN6_BSD_RING_BASE;
3031
		/* gen6 bsd needs a special wa for tail updates */
3032
		if (IS_GEN6(dev_priv))
3033 3034
			engine->write_tail = gen6_bsd_ring_write_tail;
		engine->flush = gen6_bsd_ring_flush;
3035
		if (INTEL_GEN(dev_priv) >= 8)
3036
			engine->irq_enable_mask =
3037
				GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
3038
		else
3039
			engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
3040
	} else {
3041 3042
		engine->mmio_base = BSD_RING_BASE;
		engine->flush = bsd_ring_flush;
3043
		if (IS_GEN5(dev_priv))
3044
			engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
3045
		else
3046
			engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
3047 3048
	}

3049
	return intel_init_ring_buffer(dev, engine);
3050
}
3051

3052
/**
3053
 * Initialize the second BSD ring (e.g. Broadwell GT3, Skylake GT3)
3054 3055 3056 3057
 */
int intel_init_bsd2_ring_buffer(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
3058
	struct intel_engine_cs *engine = &dev_priv->engine[VCS2];
3059 3060 3061 3062

	engine->name = "bsd2 ring";
	engine->id = VCS2;
	engine->exec_id = I915_EXEC_BSD;
3063
	engine->hw_id = 4;
3064
	engine->mmio_base = GEN8_BSD2_RING_BASE;
3065 3066 3067

	intel_ring_default_vfuncs(dev_priv, engine);

3068 3069
	engine->flush = gen6_bsd_ring_flush;
	engine->irq_enable_mask =
3070 3071
			GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;

3072
	return intel_init_ring_buffer(dev, engine);
3073 3074
}

3075 3076
int intel_init_blt_ring_buffer(struct drm_device *dev)
{
3077
	struct drm_i915_private *dev_priv = dev->dev_private;
3078
	struct intel_engine_cs *engine = &dev_priv->engine[BCS];
3079 3080 3081 3082

	engine->name = "blitter ring";
	engine->id = BCS;
	engine->exec_id = I915_EXEC_BLT;
3083
	engine->hw_id = 2;
3084
	engine->mmio_base = BLT_RING_BASE;
3085 3086 3087

	intel_ring_default_vfuncs(dev_priv, engine);

3088
	engine->flush = gen6_ring_flush;
3089
	if (INTEL_GEN(dev_priv) >= 8)
3090
		engine->irq_enable_mask =
3091
			GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
3092
	else
3093
		engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
3094

3095
	return intel_init_ring_buffer(dev, engine);
3096
}
3097

int intel_init_vebox_ring_buffer(struct drm_device *dev)
{
3100
	struct drm_i915_private *dev_priv = dev->dev_private;
3101
	struct intel_engine_cs *engine = &dev_priv->engine[VECS];

3103 3104 3105
	engine->name = "video enhancement ring";
	engine->id = VECS;
	engine->exec_id = I915_EXEC_VEBOX;
3106
	engine->hw_id = 3;
3107
	engine->mmio_base = VEBOX_RING_BASE;
3108 3109 3110

	intel_ring_default_vfuncs(dev_priv, engine);

3111
	engine->flush = gen6_ring_flush;
3112

3113
	if (INTEL_GEN(dev_priv) >= 8) {
3114
		engine->irq_enable_mask =
3115
			GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
3116
	} else {
3117 3118 3119
		engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
		engine->irq_get = hsw_vebox_get_irq;
		engine->irq_put = hsw_vebox_put_irq;
3120
	}

3122
	return intel_init_ring_buffer(dev, engine);
}

3125
int
3126
intel_ring_flush_all_caches(struct drm_i915_gem_request *req)
3127
{
3128
	struct intel_engine_cs *engine = req->engine;
3129 3130
	int ret;

3131
	if (!engine->gpu_caches_dirty)
3132 3133
		return 0;

3134
	ret = engine->flush(req, 0, I915_GEM_GPU_DOMAINS);
3135 3136 3137
	if (ret)
		return ret;

3138
	trace_i915_gem_ring_flush(req, 0, I915_GEM_GPU_DOMAINS);
3139

3140
	engine->gpu_caches_dirty = false;
3141 3142 3143 3144
	return 0;
}

int
3145
intel_ring_invalidate_all_caches(struct drm_i915_gem_request *req)
3146
{
3147
	struct intel_engine_cs *engine = req->engine;
3148 3149 3150 3151
	uint32_t flush_domains;
	int ret;

	flush_domains = 0;
3152
	if (engine->gpu_caches_dirty)
3153 3154
		flush_domains = I915_GEM_GPU_DOMAINS;

3155
	ret = engine->flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
3156 3157 3158
	if (ret)
		return ret;

3159
	trace_i915_gem_ring_flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
3160

3161
	engine->gpu_caches_dirty = false;
3162 3163
	return 0;
}
3164 3165

void
3166
intel_stop_engine(struct intel_engine_cs *engine)
3167 3168 3169
{
	int ret;

3170
	if (!intel_engine_initialized(engine))
3171 3172
		return;

3173
	ret = intel_engine_idle(engine);
3174
	if (ret)
3175
		DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
3176
			  engine->name, ret);
3177

3178
	stop_ring(engine);
3179
}