/*
 * Copyright © 2008-2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Zou Nan hai <nanhai.zou@intel.com>
 *    Xiang Hai hao<haihao.xiang@intel.com>
 *
 */

#include <linux/log2.h>
#include <drm/drmP.h>
#include "i915_drv.h"
#include <drm/i915_drm.h>
#include "i915_trace.h"
#include "intel_drv.h"

/* Rough estimate of the typical request size, performing a flush,
 * set-context and then emitting the batch.
 */
#define LEGACY_REQUEST_SIZE 200

int __intel_ring_space(int head, int tail, int size)
{
	int space = head - tail;
	if (space <= 0)
		space += size;
	return space - I915_RING_FREE_SPACE;
}

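/*
 * Worked example (purely illustrative; the numbers are hypothetical):
 * with a 4096-byte ring, head == 512 and tail == 3584,
 * __intel_ring_space() computes (512 - 3584) + 4096 = 1024 bytes and
 * then subtracts I915_RING_FREE_SPACE so that the tail can never quite
 * catch up to the head and make a full ring look empty.
 */
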
void intel_ring_update_space(struct intel_ringbuffer *ringbuf)
{
	if (ringbuf->last_retired_head != -1) {
		ringbuf->head = ringbuf->last_retired_head;
		ringbuf->last_retired_head = -1;
	}

	ringbuf->space = __intel_ring_space(ringbuf->head & HEAD_ADDR,
					    ringbuf->tail, ringbuf->size);
}

static void __intel_ring_advance(struct intel_engine_cs *engine)
{
	struct intel_ringbuffer *ringbuf = engine->buffer;
	ringbuf->tail &= ringbuf->size - 1;
	engine->write_tail(engine, ringbuf->tail);
}

static int
gen2_render_ring_flush(struct drm_i915_gem_request *req,
		       u32	invalidate_domains,
		       u32	flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 cmd;
	int ret;

	cmd = MI_FLUSH;
	if (((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER) == 0)
		cmd |= MI_NO_WRITE_FLUSH;

	if (invalidate_domains & I915_GEM_DOMAIN_SAMPLER)
		cmd |= MI_READ_FLUSH;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

static int
gen4_render_ring_flush(struct drm_i915_gem_request *req,
		       u32	invalidate_domains,
		       u32	flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 cmd;
	int ret;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */

	cmd = MI_FLUSH | MI_NO_WRITE_FLUSH;
	if ((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER)
		cmd &= ~MI_NO_WRITE_FLUSH;
	if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION)
		cmd |= MI_EXE_FLUSH;

	if (invalidate_domains & I915_GEM_DOMAIN_COMMAND &&
	    (IS_G4X(req->i915) || IS_GEN5(req->i915)))
		cmd |= MI_INVALIDATE_ISP;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(5));
	intel_ring_emit(engine, PIPE_CONTROL_CS_STALL |
			PIPE_CONTROL_STALL_AT_SCOREBOARD);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
	intel_ring_emit(engine, 0); /* low dword */
	intel_ring_emit(engine, 0); /* high dword */
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(5));
	intel_ring_emit(engine, PIPE_CONTROL_QW_WRITE);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

static int
gen6_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 flags = 0;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = intel_emit_post_sync_nonzero_flush(req);
	if (ret)
		return ret;

	/* Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, PIPE_CONTROL_CS_STALL |
			      PIPE_CONTROL_STALL_AT_SCOREBOARD);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen7_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	u32 flags = 0;
	u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/* Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;

		/* Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set. */
		gen7_render_ring_cs_stall_wa(req);
	}

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen8_emit_pipe_control(struct drm_i915_gem_request *req,
		       u32 flags, u32 scratch_addr)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(6));
	intel_ring_emit(engine, flags);
	intel_ring_emit(engine, scratch_addr);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, 0);
	intel_ring_advance(engine);

	return 0;
}

static int
gen8_render_ring_flush(struct drm_i915_gem_request *req,
		       u32 invalidate_domains, u32 flush_domains)
{
	u32 flags = 0;
	u32 scratch_addr = req->engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
	int ret;

	flags |= PIPE_CONTROL_CS_STALL;

	if (flush_domains) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (invalidate_domains) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		/* WaCsStallBeforeStateCacheInvalidate:bdw,chv */
		ret = gen8_emit_pipe_control(req,
					     PIPE_CONTROL_CS_STALL |
					     PIPE_CONTROL_STALL_AT_SCOREBOARD,
					     0);
		if (ret)
			return ret;
	}

	return gen8_emit_pipe_control(req, flags, scratch_addr);
}

422
static void ring_write_tail(struct intel_engine_cs *engine,
423
			    u32 value)
424
{
425
	struct drm_i915_private *dev_priv = engine->i915;
426
	I915_WRITE_TAIL(engine, value);
427 428
}

429
u64 intel_ring_get_active_head(struct intel_engine_cs *engine)
430
{
431
	struct drm_i915_private *dev_priv = engine->i915;
432
	u64 acthd;
433

434
	if (INTEL_GEN(dev_priv) >= 8)
435 436
		acthd = I915_READ64_2x32(RING_ACTHD(engine->mmio_base),
					 RING_ACTHD_UDW(engine->mmio_base));
437
	else if (INTEL_GEN(dev_priv) >= 4)
438
		acthd = I915_READ(RING_ACTHD(engine->mmio_base));
439 440 441 442
	else
		acthd = I915_READ(ACTHD);

	return acthd;
443 444
}

445
static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
446
{
447
	struct drm_i915_private *dev_priv = engine->i915;
448 449 450
	u32 addr;

	addr = dev_priv->status_page_dmah->busaddr;
451
	if (INTEL_GEN(dev_priv) >= 4)
452 453 454 455
		addr |= (dev_priv->status_page_dmah->busaddr >> 28) & 0xf0;
	I915_WRITE(HWS_PGA, addr);
}

456
static void intel_ring_setup_status_page(struct intel_engine_cs *engine)
457
{
458
	struct drm_i915_private *dev_priv = engine->i915;
459
	i915_reg_t mmio;
460 461 462 463

	/* The ring status page addresses are no longer next to the rest of
	 * the ring registers as of gen7.
	 */
464
	if (IS_GEN7(dev_priv)) {
465
		switch (engine->id) {
		case RCS:
			mmio = RENDER_HWS_PGA_GEN7;
			break;
		case BCS:
			mmio = BLT_HWS_PGA_GEN7;
			break;
		/*
		 * VCS2 actually doesn't exist on Gen7. Only shut up
		 * gcc switch check warning
		 */
		case VCS2:
		case VCS:
			mmio = BSD_HWS_PGA_GEN7;
			break;
		case VECS:
			mmio = VEBOX_HWS_PGA_GEN7;
			break;
		}
484
	} else if (IS_GEN6(dev_priv)) {
485
		mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
486 487
	} else {
		/* XXX: gen8 returns to sanity */
488
		mmio = RING_HWS_PGA(engine->mmio_base);
489 490
	}

491
	I915_WRITE(mmio, (u32)engine->status_page.gfx_addr);
	POSTING_READ(mmio);

	/*
	 * Flush the TLB for this page
	 *
	 * FIXME: These two bits have disappeared on gen8, so a question
	 * arises: do we still need this and if so how should we go about
	 * invalidating the TLB?
	 */
501
	if (IS_GEN(dev_priv, 6, 7)) {
502
		i915_reg_t reg = RING_INSTPM(engine->mmio_base);
503 504

	/* ring should be idle before issuing a sync flush */
505
		WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
506 507 508 509

		I915_WRITE(reg,
			   _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
					      INSTPM_SYNC_FLUSH));
510 511 512
		if (intel_wait_for_register(dev_priv,
					    reg, INSTPM_SYNC_FLUSH, 0,
					    1000))
513
			DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
514
				  engine->name);
515 516 517
	}
}

518
static bool stop_ring(struct intel_engine_cs *engine)
519
{
520
	struct drm_i915_private *dev_priv = engine->i915;
521

522
	if (!IS_GEN2(dev_priv)) {
523
		I915_WRITE_MODE(engine, _MASKED_BIT_ENABLE(STOP_RING));
		if (intel_wait_for_register(dev_priv,
					    RING_MI_MODE(engine->mmio_base),
					    MODE_IDLE,
					    MODE_IDLE,
					    1000)) {
529 530
			DRM_ERROR("%s : timed out trying to stop ring\n",
				  engine->name);
531 532 533 534
			/* Sometimes we observe that the idle flag is not
			 * set even though the ring is empty. So double
			 * check before giving up.
			 */
535
			if (I915_READ_HEAD(engine) != I915_READ_TAIL(engine))
536
				return false;
537 538
		}
	}
539

540 541 542
	I915_WRITE_CTL(engine, 0);
	I915_WRITE_HEAD(engine, 0);
	engine->write_tail(engine, 0);
543

544
	if (!IS_GEN2(dev_priv)) {
545 546
		(void)I915_READ_CTL(engine);
		I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
547
	}
548

549
	return (I915_READ_HEAD(engine) & HEAD_ADDR) == 0;
550
}
551

552
static int init_ring_common(struct intel_engine_cs *engine)
553
{
554
	struct drm_i915_private *dev_priv = engine->i915;
555
	struct intel_ringbuffer *ringbuf = engine->buffer;
556
	struct drm_i915_gem_object *obj = ringbuf->obj;
557 558
	int ret = 0;

559
	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
560

561
	if (!stop_ring(engine)) {
562
		/* G45 ring initialization often fails to reset head to zero */
563 564
		DRM_DEBUG_KMS("%s head not reset to zero "
			      "ctl %08x head %08x tail %08x start %08x\n",
			      engine->name,
			      I915_READ_CTL(engine),
			      I915_READ_HEAD(engine),
			      I915_READ_TAIL(engine),
			      I915_READ_START(engine));
570

571
		if (!stop_ring(engine)) {
572 573
			DRM_ERROR("failed to set %s head to zero "
				  "ctl %08x head %08x tail %08x start %08x\n",
				  engine->name,
				  I915_READ_CTL(engine),
				  I915_READ_HEAD(engine),
				  I915_READ_TAIL(engine),
				  I915_READ_START(engine));
579 580
			ret = -EIO;
			goto out;
581
		}
582 583
	}

584
	if (I915_NEED_GFX_HWS(dev_priv))
585
		intel_ring_setup_status_page(engine);
586
	else
587
		ring_setup_phys_status_page(engine);
588

589
	/* Enforce ordering by reading HEAD register back */
590
	I915_READ_HEAD(engine);
591

592 593 594 595
	/* Initialize the ring. This must happen _after_ we've cleared the ring
	 * registers with the above sequence (the readback of the HEAD registers
	 * also enforces ordering), otherwise the hw might lose the new ring
	 * register values. */
596
	I915_WRITE_START(engine, i915_gem_obj_ggtt_offset(obj));
597 598

	/* WaClearRingBufHeadRegAtInit:ctg,elk */
599
	if (I915_READ_HEAD(engine))
600
		DRM_DEBUG("%s initialization failed [head=%08x], fudging\n",
601 602 603
			  engine->name, I915_READ_HEAD(engine));
	I915_WRITE_HEAD(engine, 0);
	(void)I915_READ_HEAD(engine);
604

605
	I915_WRITE_CTL(engine,
606
			((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES)
607
			| RING_VALID);
608 609

	/* If the head is still not zero, the ring is dead */
610 611 612
	if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 &&
		     I915_READ_START(engine) == i915_gem_obj_ggtt_offset(obj) &&
		     (I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) {
613
		DRM_ERROR("%s initialization failed "
614
			  "ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08lx]\n",
			  engine->name,
			  I915_READ_CTL(engine),
			  I915_READ_CTL(engine) & RING_VALID,
			  I915_READ_HEAD(engine), I915_READ_TAIL(engine),
			  I915_READ_START(engine),
			  (unsigned long)i915_gem_obj_ggtt_offset(obj));
621 622
		ret = -EIO;
		goto out;
623 624
	}

625
	ringbuf->last_retired_head = -1;
626 627
	ringbuf->head = I915_READ_HEAD(engine);
	ringbuf->tail = I915_READ_TAIL(engine) & TAIL_ADDR;
628
	intel_ring_update_space(ringbuf);
629

630
	intel_engine_init_hangcheck(engine);
631

632
out:
633
	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
634 635

	return ret;
636 637
}

638
void intel_fini_pipe_control(struct intel_engine_cs *engine)
639
{
640
	if (engine->scratch.obj == NULL)
641 642
		return;

643
	i915_gem_object_ggtt_unpin(engine->scratch.obj);
644
	i915_gem_object_put(engine->scratch.obj);
645
	engine->scratch.obj = NULL;
646 647
}

648
int intel_init_pipe_control(struct intel_engine_cs *engine, int size)
649
{
650
	struct drm_i915_gem_object *obj;
651 652
	int ret;

653
	WARN_ON(engine->scratch.obj);
654

655
	obj = i915_gem_object_create_stolen(&engine->i915->drm, size);
656
	if (!obj)
657
		obj = i915_gem_object_create(&engine->i915->drm, size);
658 659 660
	if (IS_ERR(obj)) {
		DRM_ERROR("Failed to allocate scratch page\n");
		ret = PTR_ERR(obj);
661 662
		goto err;
	}
663

664
	ret = i915_gem_obj_ggtt_pin(obj, 4096, PIN_HIGH);
665 666
	if (ret)
		goto err_unref;
667

668 669
	engine->scratch.obj = obj;
	engine->scratch.gtt_offset = i915_gem_obj_ggtt_offset(obj);
670
	DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08x\n",
671
			 engine->name, engine->scratch.gtt_offset);
672 673 674
	return 0;

err_unref:
675
	i915_gem_object_put(engine->scratch.obj);
676 677 678 679
err:
	return ret;
}

680
static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
681
{
682
	struct intel_engine_cs *engine = req->engine;
683 684
	struct i915_workarounds *w = &req->i915->workarounds;
	int ret, i;
685

686
	if (w->count == 0)
687
		return 0;
688

689
	engine->gpu_caches_dirty = true;
690
	ret = intel_ring_flush_all_caches(req);
691 692
	if (ret)
		return ret;
693

694
	ret = intel_ring_begin(req, (w->count * 2 + 2));
695 696 697
	if (ret)
		return ret;

698
	intel_ring_emit(engine, MI_LOAD_REGISTER_IMM(w->count));
699
	for (i = 0; i < w->count; i++) {
700 701
		intel_ring_emit_reg(engine, w->reg[i].addr);
		intel_ring_emit(engine, w->reg[i].value);
702
	}
703
	intel_ring_emit(engine, MI_NOOP);
704

705
	intel_ring_advance(engine);
706

707
	engine->gpu_caches_dirty = true;
708
	ret = intel_ring_flush_all_caches(req);
709 710
	if (ret)
		return ret;
711

712
	DRM_DEBUG_DRIVER("Number of Workarounds emitted: %d\n", w->count);
713

714
	return 0;
715 716
}

717
static int intel_rcs_ctx_init(struct drm_i915_gem_request *req)
718 719 720
{
	int ret;

721
	ret = intel_ring_workarounds_emit(req);
722 723 724
	if (ret != 0)
		return ret;

725
	ret = i915_gem_render_state_init(req);
726
	if (ret)
727
		return ret;
728

729
	return 0;
730 731
}

732
static int wa_add(struct drm_i915_private *dev_priv,
733 734
		  i915_reg_t addr,
		  const u32 mask, const u32 val)
{
	const u32 idx = dev_priv->workarounds.count;

	if (WARN_ON(idx >= I915_MAX_WA_REGS))
		return -ENOSPC;

	dev_priv->workarounds.reg[idx].addr = addr;
	dev_priv->workarounds.reg[idx].value = val;
	dev_priv->workarounds.reg[idx].mask = mask;

	dev_priv->workarounds.count++;

	return 0;
748 749
}

#define WA_REG(addr, mask, val) do { \
		const int r = wa_add(dev_priv, (addr), (mask), (val)); \
		if (r) \
			return r; \
	} while (0)

#define WA_SET_BIT_MASKED(addr, mask) \
	WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask))

#define WA_CLR_BIT_MASKED(addr, mask) \
	WA_REG(addr, (mask), _MASKED_BIT_DISABLE(mask))

#define WA_SET_FIELD_MASKED(addr, mask, value) \
	WA_REG(addr, mask, _MASKED_FIELD(mask, value))

#define WA_SET_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) | (mask))
#define WA_CLR_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) & ~(mask))

#define WA_WRITE(addr, val) WA_REG(addr, 0xffffffff, val)

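/*
 * Illustrative use of the helpers above (a hypothetical example, not an
 * additional workaround): inside one of the *_init_workarounds()
 * functions,
 *
 *	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, DOP_CLOCK_GATING_DISABLE);
 *
 * records a single entry in dev_priv->workarounds; the masked-bit value
 * is only written to the register later, when
 * intel_ring_workarounds_emit() replays the whole list with one
 * MI_LOAD_REGISTER_IMM (see intel_rcs_ctx_init()).
 */
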
static int wa_ring_whitelist_reg(struct intel_engine_cs *engine,
				 i915_reg_t reg)
772
{
773
	struct drm_i915_private *dev_priv = engine->i915;
774
	struct i915_workarounds *wa = &dev_priv->workarounds;
775
	const uint32_t index = wa->hw_whitelist_count[engine->id];
776 777 778 779

	if (WARN_ON(index >= RING_MAX_NONPRIV_SLOTS))
		return -EINVAL;

780
	WA_WRITE(RING_FORCE_TO_NONPRIV(engine->mmio_base, index),
781
		 i915_mmio_reg_offset(reg));
782
	wa->hw_whitelist_count[engine->id]++;
783 784 785 786

	return 0;
}

787
static int gen8_init_workarounds(struct intel_engine_cs *engine)
788
{
789
	struct drm_i915_private *dev_priv = engine->i915;
790 791

	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
792

793 794 795
	/* WaDisableAsyncFlipPerfMode:bdw,chv */
	WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);

796 797 798 799
	/* WaDisablePartialInstShootdown:bdw,chv */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);

	/* Use Force Non-Coherent whenever executing a 3D context. This is a
	 * workaround for a possible hang in the unlikely event a TLB
	 * invalidation occurs during a PSD flush.
	 */
	/* WaForceEnableNonCoherent:bdw,chv */
805
	/* WaHdcDisableFetchWhenMasked:bdw,chv */
806
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
807
			  HDC_DONOT_FETCH_MEM_WHEN_MASKED |
808 809
			  HDC_FORCE_NON_COHERENT);

	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
	 *  polygons in the same 8x4 pixel/sample area to be processed without
	 *  stalling waiting for the earlier ones to write to Hierarchical Z
	 *  buffer."
	 *
	 * This optimization is off by default for BDW and CHV; turn it on.
	 */
	WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);

820 821 822
	/* Wa4x4STCOptimizationDisable:bdw,chv */
	WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);

	/*
	 * BSpec recommends 8x4 when MSAA is used,
	 * however in practice 16x4 seems fastest.
	 *
	 * Note that PS/WM thread counts depend on the WIZ hashing
	 * disable bit, which we don't touch here, but it's good
	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
	 */
	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
			    GEN6_WIZ_HASHING_MASK,
			    GEN6_WIZ_HASHING_16x4);

835 836 837
	return 0;
}

838
static int bdw_init_workarounds(struct intel_engine_cs *engine)
839
{
840
	struct drm_i915_private *dev_priv = engine->i915;
841
	int ret;
842

843
	ret = gen8_init_workarounds(engine);
844 845 846
	if (ret)
		return ret;

847
	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
848
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
849

850
	/* WaDisableDopClockGating:bdw */
851 852
	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
			  DOP_CLOCK_GATING_DISABLE);
853

854 855
	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
			  GEN8_SAMPLER_POWER_BYPASS_DIS);
856

857
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
858 859 860
			  /* WaForceContextSaveRestoreNonCoherent:bdw */
			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
			  /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
861
			  (IS_BDW_GT3(dev_priv) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
862 863 864 865

	return 0;
}

866
static int chv_init_workarounds(struct intel_engine_cs *engine)
867
{
868
	struct drm_i915_private *dev_priv = engine->i915;
869
	int ret;
870

871
	ret = gen8_init_workarounds(engine);
872 873 874
	if (ret)
		return ret;

875
	/* WaDisableThreadStallDopClockGating:chv */
876
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
877

878 879 880
	/* Improve HiZ throughput on CHV. */
	WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);

881 882 883
	return 0;
}

884
static int gen9_init_workarounds(struct intel_engine_cs *engine)
885
{
886
	struct drm_i915_private *dev_priv = engine->i915;
887
	int ret;
888

889 890 891
	/* WaConextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl */
	I915_WRITE(GEN9_CSFE_CHICKEN1_RCS, _MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));

892
	/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl */
893 894 895
	I915_WRITE(BDW_SCRATCH1, I915_READ(BDW_SCRATCH1) |
		   GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);

896
	/* WaDisableKillLogic:bxt,skl,kbl */
897 898 899
	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
		   ECOCHK_DIS_TLB);

900 901
	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl */
	/* WaDisablePartialInstShootdown:skl,bxt,kbl */
902
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
903
			  FLOW_CONTROL_ENABLE |
904 905
			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);

906
	/* Syncing dependencies between camera and graphics:skl,bxt,kbl */
907 908 909
	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
			  GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC);

910
	/* WaDisableDgMirrorFixInHalfSliceChicken5:skl,bxt */
911 912
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
913 914
		WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
				  GEN9_DG_MIRROR_FIX_ENABLE);
915

916
	/* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
917 918
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
919 920
		WA_SET_BIT_MASKED(GEN7_COMMON_SLICE_CHICKEN1,
				  GEN9_RHWO_OPTIMIZATION_DISABLE);
921 922 923 924 925
		/*
		 * WA also requires GEN9_SLICE_COMMON_ECO_CHICKEN0[14:14] to be set
		 * but we do that in per ctx batchbuffer as there is an issue
		 * with this register not getting restored on ctx restore
		 */
926 927
	}

928 929
	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl */
	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl */
930 931 932
	WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
			  GEN9_ENABLE_YV12_BUGFIX |
			  GEN9_ENABLE_GPGPU_PREEMPTION);
933

934 935
	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl */
	/* WaDisablePartialResolveInVc:skl,bxt,kbl */
936 937
	WA_SET_BIT_MASKED(CACHE_MODE_1, (GEN8_4x4_STC_OPTIMIZATION_DISABLE |
					 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE));
938

939
	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl */
940 941 942
	WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
			  GEN9_CCS_TLB_PREFETCH_ENABLE);

943
	/* WaDisableMaskBasedCammingInRCC:skl,bxt */
944 945
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, SKL_REVID_C0) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
946 947 948
		WA_SET_BIT_MASKED(SLICE_ECO_CHICKEN0,
				  PIXEL_MASK_CAMMING_DISABLE);

949 950 951 952
	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
953

	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
	 * both tied to WaForceContextSaveRestoreNonCoherent
	 * in some hsds for skl. We keep the tie for all gen9. The
	 * documentation is a bit hazy and so we want to get common behaviour,
	 * even though there is no clear evidence we would need both on kbl/bxt.
	 * This area has been source of system hangs so we play it safe
	 * and mimic the skl regardless of what bspec says.
	 *
	 * Use Force Non-Coherent whenever executing a 3D context. This
	 * is a workaround for a possible hang in the unlikely event
	 * a TLB invalidation occurs during a PSD flush.
	 */

	/* WaForceEnableNonCoherent:skl,bxt,kbl */
	WA_SET_BIT_MASKED(HDC_CHICKEN0,
			  HDC_FORCE_NON_COHERENT);

	/* WaDisableHDCInvalidation:skl,bxt,kbl */
	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
		   BDW_DISABLE_HDC_INVALIDATION);

975 976 977 978
	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl */
	if (IS_SKYLAKE(dev_priv) ||
	    IS_KABYLAKE(dev_priv) ||
	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
979 980 981
		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
				  GEN8_SAMPLER_POWER_BYPASS_DIS);

982
	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl */
983 984
	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);

985
	/* WaOCLCoherentLineFlush:skl,bxt,kbl */
986 987 988
	I915_WRITE(GEN8_L3SQCREG4, (I915_READ(GEN8_L3SQCREG4) |
				    GEN8_LQSC_FLUSH_COHERENT_LINES));

989 990 991 992 993
	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt */
	ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
	if (ret)
		return ret;

994
	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl */
995
	ret = wa_ring_whitelist_reg(engine, GEN8_CS_CHICKEN1);
996 997 998
	if (ret)
		return ret;

999
	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl */
1000
	ret = wa_ring_whitelist_reg(engine, GEN8_HDC_CHICKEN1);
1001 1002 1003
	if (ret)
		return ret;

1004 1005 1006
	return 0;
}

1007
static int skl_tune_iz_hashing(struct intel_engine_cs *engine)
1008
{
1009
	struct drm_i915_private *dev_priv = engine->i915;
	u8 vals[3] = { 0, 0, 0 };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		u8 ss;

		/*
		 * Only consider slices where one, and only one, subslice has 7
		 * EUs
		 */
1020
		if (!is_power_of_2(dev_priv->info.subslice_7eu[i]))
			continue;

		/*
		 * subslice_7eu[i] != 0 (because of the check above) and
		 * ss_max == 4 (maximum number of subslices possible per slice)
		 *
		 * ->    0 <= ss <= 3;
		 */
		ss = ffs(dev_priv->info.subslice_7eu[i]) - 1;
		vals[i] = 3 - ss;
	}

	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
		return 0;

	/* Tune IZ hashing. See intel_device_info_runtime_init() */
	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
			    GEN9_IZ_HASHING_MASK(2) |
			    GEN9_IZ_HASHING_MASK(1) |
			    GEN9_IZ_HASHING_MASK(0),
			    GEN9_IZ_HASHING(2, vals[2]) |
			    GEN9_IZ_HASHING(1, vals[1]) |
			    GEN9_IZ_HASHING(0, vals[0]));

	return 0;
}

1048
static int skl_init_workarounds(struct intel_engine_cs *engine)
1049
{
1050
	struct drm_i915_private *dev_priv = engine->i915;
1051
	int ret;
1052

1053
	ret = gen9_init_workarounds(engine);
1054 1055
	if (ret)
		return ret;
1056

	/*
	 * Actual WA is to disable percontext preemption granularity control
	 * until D0 which is the default case so this is equivalent to
	 * !WaDisablePerCtxtPreemptionGranularityControl:skl
	 */
1062
	if (IS_SKL_REVID(dev_priv, SKL_REVID_E0, REVID_FOREVER)) {
1063 1064 1065 1066
		I915_WRITE(GEN7_FF_SLICE_CS_CHICKEN1,
			   _MASKED_BIT_ENABLE(GEN9_FFSC_PERCTX_PREEMPT_CTRL));
	}

1067
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_E0)) {
		/* WaDisableChickenBitTSGBarrierAckForFFSliceCS:skl */
		I915_WRITE(FF_SLICE_CS_CHICKEN2,
			   _MASKED_BIT_ENABLE(GEN9_TSG_BARRIER_ACK_DISABLE));
	}

	/* GEN8_L3SQCREG4 has a dependency with WA batch so any new changes
	 * involving this register should also be added to WA batch as required.
	 */
1076
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_E0))
		/* WaDisableLSQCROPERFforOCL:skl */
		I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
			   GEN8_LQSC_RO_PERF_DIS);

	/* WaEnableGapsTsvCreditFix:skl */
1082
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, REVID_FOREVER)) {
1083 1084 1085 1086
		I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
					   GEN9_GAPS_TSV_CREDIT_DISABLE));
	}

1087
	/* WaDisablePowerCompilerClockGating:skl */
1088
	if (IS_SKL_REVID(dev_priv, SKL_REVID_B0, SKL_REVID_B0))
1089 1090 1091
		WA_SET_BIT_MASKED(HIZ_CHICKEN,
				  BDW_HIZ_POWER_COMPILER_CLOCK_GATING_DISABLE);

1092
	/* WaBarrierPerformanceFixDisable:skl */
1093
	if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, SKL_REVID_D0))
1094 1095 1096 1097
		WA_SET_BIT_MASKED(HDC_CHICKEN0,
				  HDC_FENCE_DEST_SLM_DISABLE |
				  HDC_BARRIER_PERFORMANCE_DISABLE);

1098
	/* WaDisableSbeCacheDispatchPortSharing:skl */
1099
	if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_F0))
1100 1101 1102 1103
		WA_SET_BIT_MASKED(
			GEN7_HALF_SLICE_CHICKEN1,
			GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);

1104 1105 1106
	/* WaDisableGafsUnitClkGating:skl */
	WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaInPlaceDecompressionHang:skl */
	if (IS_SKL_REVID(dev_priv, SKL_REVID_H0, REVID_FOREVER))
		WA_SET_BIT(GEN9_GAMT_ECO_REG_RW_IA,
			   GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);

1112
	/* WaDisableLSQCROPERFforOCL:skl */
1113
	ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
1114 1115 1116
	if (ret)
		return ret;

1117
	return skl_tune_iz_hashing(engine);
1118 1119
}

1120
static int bxt_init_workarounds(struct intel_engine_cs *engine)
1121
{
1122
	struct drm_i915_private *dev_priv = engine->i915;
1123
	int ret;
1124

1125
	ret = gen9_init_workarounds(engine);
1126 1127
	if (ret)
		return ret;
1128

1129 1130
	/* WaStoreMultiplePTEenable:bxt */
	/* This is a requirement according to Hardware specification */
1131
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
1132 1133 1134
		I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_TLBPF);

	/* WaSetClckGatingDisableMedia:bxt */
1135
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
1136 1137 1138 1139
		I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
					    ~GEN8_DOP_CLOCK_GATE_MEDIA_ENABLE));
	}

1140 1141 1142 1143
	/* WaDisableThreadStallDopClockGating:bxt */
	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
			  STALL_DOP_GATING_DISABLE);

	/* WaDisablePooledEuLoadBalancingFix:bxt */
	if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER)) {
		WA_SET_BIT_MASKED(FF_SLICE_CS_CHICKEN2,
				  GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
	}

1150
	/* WaDisableSbeCacheDispatchPortSharing:bxt */
1151
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0)) {
		WA_SET_BIT_MASKED(
			GEN7_HALF_SLICE_CHICKEN1,
			GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
	}

1157 1158 1159
	/* WaDisableObjectLevelPreemptionForTrifanOrPolygon:bxt */
	/* WaDisableObjectLevelPreemptionForInstancedDraw:bxt */
	/* WaDisableObjectLevelPreemtionForInstanceId:bxt */
1160
	/* WaDisableLSQCROPERFforOCL:bxt */
1161
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
1162
		ret = wa_ring_whitelist_reg(engine, GEN9_CS_DEBUG_MODE1);
1163 1164
		if (ret)
			return ret;
1165

1166
		ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
1167 1168
		if (ret)
			return ret;
1169 1170
	}

1171
	/* WaProgramL3SqcReg1DefaultForPerf:bxt */
1172
	if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER))
1173 1174
		I915_WRITE(GEN8_L3SQCREG1, L3_GENERAL_PRIO_CREDITS(62) |
					   L3_HIGH_PRIO_CREDITS(2));
1175

	/* WaInsertDummyPushConstPs:bxt */
	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

	/* WaInPlaceDecompressionHang:bxt */
	if (IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
		WA_SET_BIT(GEN9_GAMT_ECO_REG_RW_IA,
			   GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);

1186 1187 1188
	return 0;
}

1189 1190
static int kbl_init_workarounds(struct intel_engine_cs *engine)
{
1191
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen9_init_workarounds(engine);
	if (ret)
		return ret;

1198 1199 1200 1201
	/* WaEnableGapsTsvCreditFix:kbl */
	I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
				   GEN9_GAPS_TSV_CREDIT_DISABLE));

	/* WaDisableDynamicCreditSharing:kbl */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
		WA_SET_BIT(GAMT_CHKN_BIT_REG,
			   GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);

	/* WaDisableFenceDestinationToSLM:kbl (pre-prod) */
	if (IS_KBL_REVID(dev_priv, KBL_REVID_A0, KBL_REVID_A0))
		WA_SET_BIT_MASKED(HDC_CHICKEN0,
				  HDC_FENCE_DEST_SLM_DISABLE);

	/* GEN8_L3SQCREG4 has a dependency with WA batch so any new changes
	 * involving this register should also be added to WA batch as required.
	 */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_E0))
		/* WaDisableLSQCROPERFforOCL:kbl */
		I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
			   GEN8_LQSC_RO_PERF_DIS);

	/* WaInsertDummyPushConstPs:kbl */
	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

1225 1226 1227
	/* WaDisableGafsUnitClkGating:kbl */
	WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaDisableSbeCacheDispatchPortSharing:kbl */
	WA_SET_BIT_MASKED(
		GEN7_HALF_SLICE_CHICKEN1,
		GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);

1233 1234 1235 1236
	/* WaInPlaceDecompressionHang:kbl */
	WA_SET_BIT(GEN9_GAMT_ECO_REG_RW_IA,
		   GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);

	/* WaDisableLSQCROPERFforOCL:kbl */
	ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
	if (ret)
		return ret;

1242 1243 1244
	return 0;
}

1245
int init_workarounds_ring(struct intel_engine_cs *engine)
1246
{
1247
	struct drm_i915_private *dev_priv = engine->i915;
1248

1249
	WARN_ON(engine->id != RCS);
1250 1251

	dev_priv->workarounds.count = 0;
1252
	dev_priv->workarounds.hw_whitelist_count[RCS] = 0;
1253

1254
	if (IS_BROADWELL(dev_priv))
1255
		return bdw_init_workarounds(engine);
1256

1257
	if (IS_CHERRYVIEW(dev_priv))
1258
		return chv_init_workarounds(engine);
1259

1260
	if (IS_SKYLAKE(dev_priv))
1261
		return skl_init_workarounds(engine);
1262

1263
	if (IS_BROXTON(dev_priv))
1264
		return bxt_init_workarounds(engine);
1265

1266 1267 1268
	if (IS_KABYLAKE(dev_priv))
		return kbl_init_workarounds(engine);

1269 1270 1271
	return 0;
}

1272
static int init_render_ring(struct intel_engine_cs *engine)
1273
{
1274
	struct drm_i915_private *dev_priv = engine->i915;
1275
	int ret = init_ring_common(engine);
1276 1277
	if (ret)
		return ret;
1278

1279
	/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
1280
	if (IS_GEN(dev_priv, 4, 6))
1281
		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
1282 1283 1284 1285

	/* We need to disable the AsyncFlip performance optimisations in order
	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
	 * programmed to '1' on all products.
1286
	 *
1287
	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
1288
	 */
1289
	if (IS_GEN(dev_priv, 6, 7))
1290 1291
		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));

1292
	/* Required for the hardware to program scanline values for waiting */
1293
	/* WaEnableFlushTlbInvalidationMode:snb */
1294
	if (IS_GEN6(dev_priv))
1295
		I915_WRITE(GFX_MODE,
1296
			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
1297

1298
	/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
1299
	if (IS_GEN7(dev_priv))
1300
		I915_WRITE(GFX_MODE_GEN7,
1301
			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
1302
			   _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
1303

1304
	if (IS_GEN6(dev_priv)) {
		/* From the Sandybridge PRM, volume 1 part 3, page 24:
		 * "If this bit is set, STCunit will have LRA as replacement
		 *  policy. [...] This bit must be reset.  LRA replacement
		 *  policy is not supported."
		 */
		I915_WRITE(CACHE_MODE_0,
1311
			   _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
1312 1313
	}

1314
	if (IS_GEN(dev_priv, 6, 7))
1315
		I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
1316

1317 1318
	if (INTEL_INFO(dev_priv)->gen >= 6)
		I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
1319

1320
	return init_workarounds_ring(engine);
1321 1322
}

1323
static void render_ring_cleanup(struct intel_engine_cs *engine)
1324
{
1325
	struct drm_i915_private *dev_priv = engine->i915;
1326 1327 1328

	if (dev_priv->semaphore_obj) {
		i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj);
1329
		i915_gem_object_put(dev_priv->semaphore_obj);
1330 1331
		dev_priv->semaphore_obj = NULL;
	}
1332

1333
	intel_fini_pipe_control(engine);
1334 1335
}

1336
static int gen8_rcs_signal(struct drm_i915_gem_request *signaller_req,
1337 1338 1339
			   unsigned int num_dwords)
{
#define MBOX_UPDATE_DWORDS 8
1340
	struct intel_engine_cs *signaller = signaller_req->engine;
1341
	struct drm_i915_private *dev_priv = signaller_req->i915;
1342
	struct intel_engine_cs *waiter;
1343 1344
	enum intel_engine_id id;
	int ret, num_rings;
1345

1346
	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
1347 1348 1349
	num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
#undef MBOX_UPDATE_DWORDS

1350
	ret = intel_ring_begin(signaller_req, num_dwords);
1351 1352 1353
	if (ret)
		return ret;

1354 1355
	for_each_engine_id(waiter, dev_priv, id) {
		u64 gtt_offset = signaller->semaphore.signal_ggtt[id];
		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
			continue;

		intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
		intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
					   PIPE_CONTROL_QW_WRITE |
1362
					   PIPE_CONTROL_CS_STALL);
1363 1364
		intel_ring_emit(signaller, lower_32_bits(gtt_offset));
		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
1365
		intel_ring_emit(signaller, signaller_req->fence.seqno);
1366 1367
		intel_ring_emit(signaller, 0);
		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
1368
					   MI_SEMAPHORE_TARGET(waiter->hw_id));
		intel_ring_emit(signaller, 0);
	}

	return 0;
}

1375
static int gen8_xcs_signal(struct drm_i915_gem_request *signaller_req,
1376 1377 1378
			   unsigned int num_dwords)
{
#define MBOX_UPDATE_DWORDS 6
1379
	struct intel_engine_cs *signaller = signaller_req->engine;
1380
	struct drm_i915_private *dev_priv = signaller_req->i915;
1381
	struct intel_engine_cs *waiter;
1382 1383
	enum intel_engine_id id;
	int ret, num_rings;
1384

1385
	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
1386 1387 1388
	num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
#undef MBOX_UPDATE_DWORDS

1389
	ret = intel_ring_begin(signaller_req, num_dwords);
1390 1391 1392
	if (ret)
		return ret;

1393 1394
	for_each_engine_id(waiter, dev_priv, id) {
		u64 gtt_offset = signaller->semaphore.signal_ggtt[id];
		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
			continue;

		intel_ring_emit(signaller, (MI_FLUSH_DW + 1) |
					   MI_FLUSH_DW_OP_STOREDW);
		intel_ring_emit(signaller, lower_32_bits(gtt_offset) |
					   MI_FLUSH_DW_USE_GTT);
		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
1403
		intel_ring_emit(signaller, signaller_req->fence.seqno);
1404
		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
1405
					   MI_SEMAPHORE_TARGET(waiter->hw_id));
		intel_ring_emit(signaller, 0);
	}

	return 0;
}

1412
static int gen6_signal(struct drm_i915_gem_request *signaller_req,
1413
		       unsigned int num_dwords)
1414
{
1415
	struct intel_engine_cs *signaller = signaller_req->engine;
1416
	struct drm_i915_private *dev_priv = signaller_req->i915;
1417
	struct intel_engine_cs *useless;
1418 1419
	enum intel_engine_id id;
	int ret, num_rings;
1420

1421
#define MBOX_UPDATE_DWORDS 3
1422
	num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
1423 1424
	num_dwords += round_up((num_rings-1) * MBOX_UPDATE_DWORDS, 2);
#undef MBOX_UPDATE_DWORDS
1425

1426
	ret = intel_ring_begin(signaller_req, num_dwords);
1427 1428 1429
	if (ret)
		return ret;

1430 1431
	for_each_engine_id(useless, dev_priv, id) {
		i915_reg_t mbox_reg = signaller->semaphore.mbox.signal[id];
1432 1433

		if (i915_mmio_reg_valid(mbox_reg)) {
1434
			intel_ring_emit(signaller, MI_LOAD_REGISTER_IMM(1));
1435
			intel_ring_emit_reg(signaller, mbox_reg);
1436
			intel_ring_emit(signaller, signaller_req->fence.seqno);
1437 1438
		}
	}
1439

1440 1441 1442 1443
	/* If num_dwords was rounded, make sure the tail pointer is correct */
	if (num_rings % 2 == 0)
		intel_ring_emit(signaller, MI_NOOP);

1444
	return 0;
}

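/*
 * Dword accounting example for gen6_signal() (the ring count here is
 * hypothetical): with 4 rings there are 3 mailboxes to signal, i.e.
 * 3 * MBOX_UPDATE_DWORDS = 9 dwords, which round_up(..., 2) pads to 10;
 * the trailing MI_NOOP emitted above fills that extra slot so the tail
 * still advances by the amount that was reserved.
 */
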
/**
 * gen6_add_request - Update the semaphore mailbox registers
1449 1450
 *
 * @request - request to write to the ring
1451 1452 1453 1454
 *
 * Update the mailbox registers in the *other* rings with the current seqno.
 * This acts like a signal in the canonical semaphore.
 */
1455
static int
1456
gen6_add_request(struct drm_i915_gem_request *req)
1457
{
1458
	struct intel_engine_cs *engine = req->engine;
1459
	int ret;
1460

1461 1462
	if (engine->semaphore.signal)
		ret = engine->semaphore.signal(req, 4);
	else
1464
		ret = intel_ring_begin(req, 4);

1466 1467 1468
	if (ret)
		return ret;

1469 1470 1471
	intel_ring_emit(engine, MI_STORE_DWORD_INDEX);
	intel_ring_emit(engine,
			I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
1472
	intel_ring_emit(engine, req->fence.seqno);
1473 1474
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	__intel_ring_advance(engine);
1475 1476 1477 1478

	return 0;
}

static int
gen8_render_add_request(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	if (engine->semaphore.signal)
		ret = engine->semaphore.signal(req, 8);
	else
		ret = intel_ring_begin(req, 8);
	if (ret)
		return ret;

	intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(6));
	intel_ring_emit(engine, (PIPE_CONTROL_GLOBAL_GTT_IVB |
				 PIPE_CONTROL_CS_STALL |
				 PIPE_CONTROL_QW_WRITE));
	intel_ring_emit(engine, intel_hws_seqno_address(req->engine));
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, i915_gem_request_get_seqno(req));
	/* We're thrashing one dword of HWS. */
	intel_ring_emit(engine, 0);
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	intel_ring_emit(engine, MI_NOOP);
	__intel_ring_advance(engine);

	return 0;
}

1508
static inline bool i915_gem_has_seqno_wrapped(struct drm_i915_private *dev_priv,
					      u32 seqno)
{
	return dev_priv->last_seqno < seqno;
}

/**
 * intel_ring_sync - sync the waiter to the signaller on seqno
 *
 * @waiter - ring that is waiting
 * @signaller - ring which has, or will signal
 * @seqno - seqno which the waiter will block on
 */
1521 1522

static int
1523
gen8_ring_sync(struct drm_i915_gem_request *waiter_req,
1524 1525 1526
	       struct intel_engine_cs *signaller,
	       u32 seqno)
{
1527
	struct intel_engine_cs *waiter = waiter_req->engine;
1528
	struct drm_i915_private *dev_priv = waiter_req->i915;
1529
	u64 offset = GEN8_WAIT_OFFSET(waiter, signaller->id);
1530
	struct i915_hw_ppgtt *ppgtt;
1531 1532
	int ret;

1533
	ret = intel_ring_begin(waiter_req, 4);
	if (ret)
		return ret;

	intel_ring_emit(waiter, MI_SEMAPHORE_WAIT |
				MI_SEMAPHORE_GLOBAL_GTT |
				MI_SEMAPHORE_SAD_GTE_SDD);
	intel_ring_emit(waiter, seqno);
1541 1542
	intel_ring_emit(waiter, lower_32_bits(offset));
	intel_ring_emit(waiter, upper_32_bits(offset));
1543
	intel_ring_advance(waiter);

	/* When the !RCS engines idle waiting upon a semaphore, they lose their
	 * pagetables and we must reload them before executing the batch.
	 * We do this on the i915_switch_context() following the wait and
	 * before the dispatch.
	 */
	ppgtt = waiter_req->ctx->ppgtt;
	if (ppgtt && waiter_req->engine->id != RCS)
		ppgtt->pd_dirty_rings |= intel_engine_flag(waiter_req->engine);
1553 1554 1555
	return 0;
}

1556
static int
1557
gen6_ring_sync(struct drm_i915_gem_request *waiter_req,
1558
	       struct intel_engine_cs *signaller,
1559
	       u32 seqno)
1560
{
1561
	struct intel_engine_cs *waiter = waiter_req->engine;
1562 1563 1564
	u32 dw1 = MI_SEMAPHORE_MBOX |
		  MI_SEMAPHORE_COMPARE |
		  MI_SEMAPHORE_REGISTER;
1565 1566
	u32 wait_mbox = signaller->semaphore.mbox.wait[waiter->id];
	int ret;
1567

1568 1569 1570 1571 1572 1573
	/* Throughout all of the GEM code, seqno passed implies our current
	 * seqno is >= the last seqno executed. However for hardware the
	 * comparison is strictly greater than.
	 */
	seqno -= 1;

1574
	WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);
1575

1576
	ret = intel_ring_begin(waiter_req, 4);
1577 1578 1579
	if (ret)
		return ret;

1580
	/* If seqno wrap happened, omit the wait with no-ops */
1581
	if (likely(!i915_gem_has_seqno_wrapped(waiter_req->i915, seqno))) {
1582
		intel_ring_emit(waiter, dw1 | wait_mbox);
		intel_ring_emit(waiter, seqno);
		intel_ring_emit(waiter, 0);
		intel_ring_emit(waiter, MI_NOOP);
	} else {
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
		intel_ring_emit(waiter, MI_NOOP);
	}
1592
	intel_ring_advance(waiter);

	return 0;
}

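/*
 * Example of the off-by-one handled above (seqno value is hypothetical):
 * to wait until seqno 100 has executed, gen6_ring_sync() programs the
 * MI_SEMAPHORE_MBOX compare value as 99, because the hardware test is
 * strictly greater-than while the rest of GEM treats "signalled" as >=.
 */
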
static void
gen5_seqno_barrier(struct intel_engine_cs *ring)
1599
{
1600 1601 1602
	/* MI_STORE are internally buffered by the GPU and not flushed
	 * either by MI_FLUSH or SyncFlush or any other combination of
	 * MI commands.
1603
	 *
	 * "Only the submission of the store operation is guaranteed.
	 * The write result will be complete (coherent) some time later
	 * (this is practically a finite period but there is no guaranteed
	 * latency)."
	 *
	 * Empirically, we observe that we need a delay of at least 75us to
	 * be sure that the seqno write is visible by the CPU.
1611
	 */
1612
	usleep_range(125, 250);
1613 1614
}

1615 1616
static void
gen6_seqno_barrier(struct intel_engine_cs *engine)
1617
{
1618
	struct drm_i915_private *dev_priv = engine->i915;
1619

1620 1621
	/* Workaround to force correct ordering between irq and seqno writes on
	 * ivb (and maybe also on snb) by reading from a CS register (like
	 * ACTHD) before reading the status page.
	 *
	 * Note that this effectively stalls the read by the time it takes to
	 * do a memory transaction, which more or less ensures that the write
	 * from the GPU has sufficient time to invalidate the CPU cacheline.
	 * Alternatively we could delay the interrupt from the CS ring to give
	 * the write time to land, but that would incur a delay after every
	 * batch i.e. much more frequent than a delay when waiting for the
	 * interrupt (with the same net latency).
1631 1632 1633
	 *
	 * Also note that to prevent whole machine hangs on gen7, we have to
	 * take the spinlock to guard against concurrent cacheline access.
1634
	 */
1635
	spin_lock_irq(&dev_priv->uncore.lock);
1636
	POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
1637
	spin_unlock_irq(&dev_priv->uncore.lock);
1638 1639
}

1640 1641
static void
gen5_irq_enable(struct intel_engine_cs *engine)
1642
{
1643
	gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
1644 1645 1646
}

static void
1647
gen5_irq_disable(struct intel_engine_cs *engine)
1648
{
1649
	gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
1650 1651
}

1652 1653
static void
i9xx_irq_enable(struct intel_engine_cs *engine)
1654
{
1655
	struct drm_i915_private *dev_priv = engine->i915;
1656

1657 1658 1659
	dev_priv->irq_mask &= ~engine->irq_enable_mask;
	I915_WRITE(IMR, dev_priv->irq_mask);
	POSTING_READ_FW(RING_IMR(engine->mmio_base));
1660 1661
}

1662
static void
1663
i9xx_irq_disable(struct intel_engine_cs *engine)
1664
{
1665
	struct drm_i915_private *dev_priv = engine->i915;
1666

1667 1668
	dev_priv->irq_mask |= engine->irq_enable_mask;
	I915_WRITE(IMR, dev_priv->irq_mask);
1669 1670
}

1671 1672
static void
i8xx_irq_enable(struct intel_engine_cs *engine)
{
1674
	struct drm_i915_private *dev_priv = engine->i915;

1676 1677 1678
	dev_priv->irq_mask &= ~engine->irq_enable_mask;
	I915_WRITE16(IMR, dev_priv->irq_mask);
	POSTING_READ16(RING_IMR(engine->mmio_base));
}

static void
1682
i8xx_irq_disable(struct intel_engine_cs *engine)
{
1684
	struct drm_i915_private *dev_priv = engine->i915;

1686 1687
	dev_priv->irq_mask |= engine->irq_enable_mask;
	I915_WRITE16(IMR, dev_priv->irq_mask);
}

static int
bsd_ring_flush(struct drm_i915_gem_request *req,
	       u32     invalidate_domains,
	       u32     flush_domains)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, MI_FLUSH);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);
	return 0;
}

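/* A rough sketch of the breadcrumb emitted below (for illustration only):
 *
 *	MI_STORE_DWORD_INDEX
 *	I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT
 *	<request seqno>
 *	MI_USER_INTERRUPT
 *
 * i.e. store the request's seqno into the hardware status page and then
 * raise a user interrupt so that any waiters are woken.
 */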
static int
i9xx_add_request(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_ring_emit(engine, MI_STORE_DWORD_INDEX);
	intel_ring_emit(engine,
			I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
	intel_ring_emit(engine, req->fence.seqno);
	intel_ring_emit(engine, MI_USER_INTERRUPT);
	__intel_ring_advance(engine);

	return 0;
}

static void
gen6_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine,
		       ~(engine->irq_enable_mask |
			 engine->irq_keep_mask));
	gen5_enable_gt_irq(dev_priv, engine->irq_enable_mask);
}

static void
gen6_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
	gen5_disable_gt_irq(dev_priv, engine->irq_enable_mask);
}

static void
hsw_vebox_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
	gen6_enable_pm_irq(dev_priv, engine->irq_enable_mask);
}

static void
hsw_vebox_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~0);
	gen6_disable_pm_irq(dev_priv, engine->irq_enable_mask);
}

static void
gen8_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine,
		       ~(engine->irq_enable_mask |
			 engine->irq_keep_mask));
	POSTING_READ_FW(RING_IMR(engine->mmio_base));
}

static void
gen8_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
}

static int
i965_dispatch_execbuffer(struct drm_i915_gem_request *req,
			 u64 offset, u32 length,
			 unsigned dispatch_flags)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine,
			MI_BATCH_BUFFER_START |
			MI_BATCH_GTT |
			(dispatch_flags & I915_DISPATCH_SECURE ?
			 0 : MI_BATCH_NON_SECURE_I965));
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);

	return 0;
}

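/* Note (summarising the workaround below): the i830/845 CS stumbles over a
 * TLB invalidation bug, so unless the caller passes I915_DISPATCH_PINNED
 * the batch is first blitted into a stable scratch area at cs_offset and
 * executed from there, after priming the TLB entries with a dummy blit.
 */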
/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
#define I830_BATCH_LIMIT (256*1024)
#define I830_TLB_ENTRIES (2)
#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
static int
i830_dispatch_execbuffer(struct drm_i915_gem_request *req,
			 u64 offset, u32 len,
			 unsigned dispatch_flags)
{
	struct intel_engine_cs *engine = req->engine;
	u32 cs_offset = engine->scratch.gtt_offset;
	int ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	/* Evict the invalid PTE TLBs */
	intel_ring_emit(engine, COLOR_BLT_CMD | BLT_WRITE_RGBA);
	intel_ring_emit(engine, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096);
	intel_ring_emit(engine, I830_TLB_ENTRIES << 16 | 4); /* load each page */
	intel_ring_emit(engine, cs_offset);
	intel_ring_emit(engine, 0xdeadbeef);
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
		if (len > I830_BATCH_LIMIT)
			return -ENOSPC;

		ret = intel_ring_begin(req, 6 + 2);
		if (ret)
			return ret;

		/* Blit the batch (which has now all relocs applied) to the
		 * stable batch scratch bo area (so that the CS never
		 * stumbles over its tlb invalidation bug) ...
		 */
		intel_ring_emit(engine, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA);
		intel_ring_emit(engine,
				BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096);
		intel_ring_emit(engine, DIV_ROUND_UP(len, 4096) << 16 | 4096);
		intel_ring_emit(engine, cs_offset);
		intel_ring_emit(engine, 4096);
		intel_ring_emit(engine, offset);

		intel_ring_emit(engine, MI_FLUSH);
		intel_ring_emit(engine, MI_NOOP);
		intel_ring_advance(engine);

		/* ... and execute it. */
		offset = cs_offset;
	}

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
	intel_ring_emit(engine, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
					  0 : MI_BATCH_NON_SECURE));
	intel_ring_advance(engine);

	return 0;
}

static int
i915_dispatch_execbuffer(struct drm_i915_gem_request *req,
			 u64 offset, u32 len,
			 unsigned dispatch_flags)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
	intel_ring_emit(engine, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
					  0 : MI_BATCH_NON_SECURE));
	intel_ring_advance(engine);

	return 0;
}

static void cleanup_phys_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (!dev_priv->status_page_dmah)
		return;

	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
	engine->status_page.page_addr = NULL;
}

static void cleanup_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;

	obj = engine->status_page.obj;
	if (obj == NULL)
		return;

	kunmap(sg_page(obj->pages->sgl));
	i915_gem_object_ggtt_unpin(obj);
	i915_gem_object_put(obj);
	engine->status_page.obj = NULL;
}

static int init_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj = engine->status_page.obj;

	if (obj == NULL) {
		unsigned flags;
		int ret;

		obj = i915_gem_object_create(&engine->i915->drm, 4096);
		if (IS_ERR(obj)) {
			DRM_ERROR("Failed to allocate status page\n");
			return PTR_ERR(obj);
		}

		ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
		if (ret)
			goto err_unref;

		flags = 0;
		if (!HAS_LLC(engine->i915))
			/* On g33, we cannot place HWS above 256MiB, so
			 * restrict its pinning to the low mappable arena.
			 * Though this restriction is not documented for
			 * gen4, gen5, or byt, they also behave similarly
			 * and hang if the HWS is placed at the top of the
			 * GTT. To generalise, it appears that all !llc
			 * platforms have issues with us placing the HWS
			 * above the mappable region (even though we never
			 * actually map it).
			 */
			flags |= PIN_MAPPABLE;
		ret = i915_gem_obj_ggtt_pin(obj, 4096, flags);
		if (ret) {
err_unref:
			i915_gem_object_put(obj);
			return ret;
		}

		engine->status_page.obj = obj;
	}

	engine->status_page.gfx_addr = i915_gem_obj_ggtt_offset(obj);
	engine->status_page.page_addr = kmap(sg_page(obj->pages->sgl));
	memset(engine->status_page.page_addr, 0, PAGE_SIZE);

	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
			engine->name, engine->status_page.gfx_addr);

	return 0;
}

static int init_phys_status_page(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (!dev_priv->status_page_dmah) {
		dev_priv->status_page_dmah =
			drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
		if (!dev_priv->status_page_dmah)
			return -ENOMEM;
	}

	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
	memset(engine->status_page.page_addr, 0, PAGE_SIZE);

	return 0;
}

void intel_unpin_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
{
	GEM_BUG_ON(!ringbuf->vma);
	GEM_BUG_ON(!ringbuf->vaddr);

	if (HAS_LLC(ringbuf->obj->base.dev) && !ringbuf->obj->stolen)
		i915_gem_object_unpin_map(ringbuf->obj);
	else
		i915_vma_unpin_iomap(ringbuf->vma);
	ringbuf->vaddr = NULL;

	i915_gem_object_ggtt_unpin(ringbuf->obj);
	ringbuf->vma = NULL;
}

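/* Two mapping strategies are used below: on LLC platforms the ringbuffer
 * pages are mapped through the CPU (i915_gem_object_pin_map), while on
 * non-LLC platforms the buffer is pinned into the mappable aperture and
 * accessed through a GTT iomap, which requires the device to be awake.
 */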
int intel_pin_and_map_ringbuffer_obj(struct drm_i915_private *dev_priv,
				     struct intel_ringbuffer *ringbuf)
{
	struct drm_i915_gem_object *obj = ringbuf->obj;
	/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
	unsigned flags = PIN_OFFSET_BIAS | 4096;
	void *addr;
	int ret;

	if (HAS_LLC(dev_priv) && !obj->stolen) {
		ret = i915_gem_obj_ggtt_pin(obj, PAGE_SIZE, flags);
		if (ret)
			return ret;

		ret = i915_gem_object_set_to_cpu_domain(obj, true);
		if (ret)
			goto err_unpin;

		addr = i915_gem_object_pin_map(obj);
		if (IS_ERR(addr)) {
			ret = PTR_ERR(addr);
			goto err_unpin;
		}
	} else {
		ret = i915_gem_obj_ggtt_pin(obj, PAGE_SIZE,
					    flags | PIN_MAPPABLE);
		if (ret)
			return ret;

		ret = i915_gem_object_set_to_gtt_domain(obj, true);
		if (ret)
			goto err_unpin;

		/* Access through the GTT requires the device to be awake. */
		assert_rpm_wakelock_held(dev_priv);

		addr = (void __force *)
			i915_vma_pin_iomap(i915_gem_obj_to_ggtt(obj));
		if (IS_ERR(addr)) {
			ret = PTR_ERR(addr);
			goto err_unpin;
		}
	}

	ringbuf->vaddr = addr;
	ringbuf->vma = i915_gem_obj_to_ggtt(obj);
	return 0;

err_unpin:
	i915_gem_object_ggtt_unpin(obj);
	return ret;
}

static void intel_destroy_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
{
	i915_gem_object_put(ringbuf->obj);
	ringbuf->obj = NULL;
}

static int intel_alloc_ringbuffer_obj(struct drm_device *dev,
				      struct intel_ringbuffer *ringbuf)
{
	struct drm_i915_gem_object *obj;

	obj = NULL;
	if (!HAS_LLC(dev))
		obj = i915_gem_object_create_stolen(dev, ringbuf->size);
	if (obj == NULL)
		obj = i915_gem_object_create(dev, ringbuf->size);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	/* mark ring buffers as read-only from GPU side by default */
	obj->gt_ro = 1;

	ringbuf->obj = obj;

	return 0;
}

struct intel_ringbuffer *
intel_engine_create_ringbuffer(struct intel_engine_cs *engine, int size)
{
	struct intel_ringbuffer *ring;
	int ret;

	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
	if (ring == NULL) {
		DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s\n",
				 engine->name);
		return ERR_PTR(-ENOMEM);
	}

	ring->engine = engine;
	list_add(&ring->link, &engine->buffers);

	ring->size = size;
	/* Workaround an erratum on the i830 which causes a hang if
	 * the TAIL pointer points to within the last 2 cachelines
	 * of the buffer.
	 */
	ring->effective_size = size;
	if (IS_I830(engine->i915) || IS_845G(engine->i915))
		ring->effective_size -= 2 * CACHELINE_BYTES;

	ring->last_retired_head = -1;
	intel_ring_update_space(ring);

	ret = intel_alloc_ringbuffer_obj(&engine->i915->drm, ring);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s: %d\n",
				 engine->name, ret);
		list_del(&ring->link);
		kfree(ring);
		return ERR_PTR(ret);
	}

	return ring;
}

void
intel_ringbuffer_free(struct intel_ringbuffer *ring)
{
	intel_destroy_ringbuffer_obj(ring);
	list_del(&ring->link);
	kfree(ring);
}

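/* Context pinning is a simple refcount (ce->pin_count) guarded by
 * struct_mutex; the first pin also pins the context state into the GGTT
 * and takes a reference on the context, dropped again on the last unpin.
 */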
static int intel_ring_context_pin(struct i915_gem_context *ctx,
				  struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];
	int ret;

	lockdep_assert_held(&ctx->i915->drm.struct_mutex);

	if (ce->pin_count++)
		return 0;

	if (ce->state) {
		ret = i915_gem_obj_ggtt_pin(ce->state, ctx->ggtt_alignment, 0);
		if (ret)
			goto error;
	}

	/* The kernel context is only used as a placeholder for flushing the
	 * active context. It is never used for submitting user rendering and
	 * as such never requires the golden render context, and so we can skip
	 * emitting it when we switch to the kernel context. This is required
	 * as during eviction we cannot allocate and pin the renderstate in
	 * order to initialise the context.
	 */
	if (ctx == ctx->i915->kernel_context)
		ce->initialised = true;

	i915_gem_context_get(ctx);
	return 0;

error:
	ce->pin_count = 0;
	return ret;
}

static void intel_ring_context_unpin(struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine)
{
	struct intel_context *ce = &ctx->engine[engine->id];

	lockdep_assert_held(&ctx->i915->drm.struct_mutex);

	if (--ce->pin_count)
		return;

	if (ce->state)
		i915_gem_object_ggtt_unpin(ce->state);

	i915_gem_context_put(ctx);
}

static int intel_init_ring_buffer(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_ringbuffer *ringbuf;
	int ret;

	WARN_ON(engine->buffer);

	intel_engine_setup_common(engine);

	memset(engine->semaphore.sync_seqno, 0,
	       sizeof(engine->semaphore.sync_seqno));

	ret = intel_engine_init_common(engine);
	if (ret)
		goto error;

	/* We may need to do things with the shrinker which
	 * require us to immediately switch back to the default
	 * context. This can cause a problem as pinning the
	 * default context also requires GTT space which may not
	 * be available. To avoid this we always pin the default
	 * context.
	 */
	ret = intel_ring_context_pin(dev_priv->kernel_context, engine);
	if (ret)
		goto error;

	ringbuf = intel_engine_create_ringbuffer(engine, 32 * PAGE_SIZE);
	if (IS_ERR(ringbuf)) {
		ret = PTR_ERR(ringbuf);
		goto error;
	}
	engine->buffer = ringbuf;

	if (I915_NEED_GFX_HWS(dev_priv)) {
		ret = init_status_page(engine);
		if (ret)
			goto error;
	} else {
		WARN_ON(engine->id != RCS);
		ret = init_phys_status_page(engine);
		if (ret)
			goto error;
	}

	ret = intel_pin_and_map_ringbuffer_obj(dev_priv, ringbuf);
	if (ret) {
		DRM_ERROR("Failed to pin and map ringbuffer %s: %d\n",
				engine->name, ret);
		intel_destroy_ringbuffer_obj(ringbuf);
		goto error;
	}

	return 0;

error:
	intel_cleanup_engine(engine);
	return ret;
}

void intel_cleanup_engine(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv;

	if (!intel_engine_initialized(engine))
		return;

	dev_priv = engine->i915;

	if (engine->buffer) {
		intel_stop_engine(engine);
		WARN_ON(!IS_GEN2(dev_priv) && (I915_READ_MODE(engine) & MODE_IDLE) == 0);

		intel_unpin_ringbuffer_obj(engine->buffer);
		intel_ringbuffer_free(engine->buffer);
		engine->buffer = NULL;
	}

	if (engine->cleanup)
		engine->cleanup(engine);

	if (I915_NEED_GFX_HWS(dev_priv)) {
		cleanup_status_page(engine);
	} else {
		WARN_ON(engine->id != RCS);
		cleanup_phys_status_page(engine);
	}

	i915_cmd_parser_fini_ring(engine);
	i915_gem_batch_pool_fini(&engine->batch_pool);
	intel_engine_fini_breadcrumbs(engine);

	intel_ring_context_unpin(dev_priv->kernel_context, engine);

	engine->i915 = NULL;
}

int intel_engine_idle(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_request *req;

	/* Wait upon the last request to be completed */
	if (list_empty(&engine->request_list))
		return 0;

	req = list_entry(engine->request_list.prev,
			 struct drm_i915_gem_request,
			 list);

	/* Make sure we do not trigger any retires */
	return __i915_wait_request(req,
				   req->i915->mm.interruptible,
				   NULL, NULL);
}

int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
{
	int ret;

	/* Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
	request->reserved_space += LEGACY_REQUEST_SIZE;

	request->ringbuf = request->engine->buffer;

	ret = intel_ring_begin(request, 0);
	if (ret)
		return ret;

	request->reserved_space -= LEGACY_REQUEST_SIZE;
	return 0;
}

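/* Strategy used below (roughly): walk the engine's request list for the
 * oldest request belonging to this ringbuffer whose retirement would free
 * at least @bytes of ring space, then block on that request.
 */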
static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
{
	struct intel_ringbuffer *ringbuf = req->ringbuf;
	struct intel_engine_cs *engine = req->engine;
	struct drm_i915_gem_request *target;

	intel_ring_update_space(ringbuf);
	if (ringbuf->space >= bytes)
		return 0;

	/*
	 * Space is reserved in the ringbuffer for finalising the request,
	 * as that cannot be allowed to fail. During request finalisation,
	 * reserved_space is set to 0 to stop the overallocation and the
	 * assumption is that then we never need to wait (which has the
	 * risk of failing with EINTR).
	 *
	 * See also i915_gem_request_alloc() and i915_add_request().
	 */
	GEM_BUG_ON(!req->reserved_space);

	list_for_each_entry(target, &engine->request_list, list) {
		unsigned space;

		/*
		 * The request queue is per-engine, so can contain requests
		 * from multiple ringbuffers. Here, we must ignore any that
		 * aren't from the ringbuffer we're considering.
		 */
		if (target->ringbuf != ringbuf)
			continue;

		/* Would completion of this request free enough space? */
		space = __intel_ring_space(target->postfix, ringbuf->tail,
					   ringbuf->size);
		if (space >= bytes)
			break;
	}

	if (WARN_ON(&target->list == &engine->request_list))
		return -ENOSPC;

	return i915_wait_request(target);
}

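/* Illustrative usage only - the pattern used by the emitters in this file:
 *
 *	ret = intel_ring_begin(req, 2);
 *	if (ret)
 *		return ret;
 *
 *	intel_ring_emit(engine, MI_NOOP);
 *	intel_ring_emit(engine, MI_NOOP);
 *	intel_ring_advance(engine);
 *
 * intel_ring_begin() guarantees that num_dwords (plus any reserved space)
 * are available, wrapping the tail and waiting for older requests to
 * retire if necessary.
 */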
int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
{
	struct intel_ringbuffer *ringbuf = req->ringbuf;
	int remain_actual = ringbuf->size - ringbuf->tail;
	int remain_usable = ringbuf->effective_size - ringbuf->tail;
	int bytes = num_dwords * sizeof(u32);
	int total_bytes, wait_bytes;
	bool need_wrap = false;

	total_bytes = bytes + req->reserved_space;

	if (unlikely(bytes > remain_usable)) {
		/*
		 * Not enough space for the basic request. So need to flush
		 * out the remainder and then wait for base + reserved.
		 */
		wait_bytes = remain_actual + total_bytes;
		need_wrap = true;
	} else if (unlikely(total_bytes > remain_usable)) {
		/*
		 * The base request will fit but the reserved space
		 * falls off the end. So we don't need an immediate wrap
		 * and only need to effectively wait for the reserved
		 * size space from the start of ringbuffer.
		 */
		wait_bytes = remain_actual + req->reserved_space;
	} else {
		/* No wrapping required, just waiting. */
		wait_bytes = total_bytes;
	}

	if (wait_bytes > ringbuf->space) {
		int ret = wait_for_space(req, wait_bytes);
		if (unlikely(ret))
			return ret;

		intel_ring_update_space(ringbuf);
		if (unlikely(ringbuf->space < wait_bytes))
			return -EAGAIN;
	}

	if (unlikely(need_wrap)) {
		GEM_BUG_ON(remain_actual > ringbuf->space);
		GEM_BUG_ON(ringbuf->tail + remain_actual > ringbuf->size);

		/* Fill the tail with MI_NOOP */
		memset(ringbuf->vaddr + ringbuf->tail, 0, remain_actual);
		ringbuf->tail = 0;
		ringbuf->space -= remain_actual;
	}

	ringbuf->space -= bytes;
	GEM_BUG_ON(ringbuf->space < 0);
	return 0;
}

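/* Worked example (assuming 64-byte cachelines): a tail 40 bytes into a
 * cacheline gives num_dwords = 10, so 16 - 10 = 6 MI_NOOPs are emitted to
 * advance the tail to the next cacheline boundary.
 */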
/* Align the ring tail to a cacheline boundary */
int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int num_dwords = (engine->buffer->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
	int ret;

	if (num_dwords == 0)
		return 0;

	num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
	ret = intel_ring_begin(req, num_dwords);
	if (ret)
		return ret;

	while (num_dwords--)
		intel_ring_emit(engine, MI_NOOP);

	intel_ring_advance(engine);

	return 0;
}

void intel_ring_init_seqno(struct intel_engine_cs *engine, u32 seqno)
{
	struct drm_i915_private *dev_priv = engine->i915;

	/* Our semaphore implementation is strictly monotonic (i.e. we proceed
	 * so long as the semaphore value in the register/page is greater
	 * than the sync value), so whenever we reset the seqno,
	 * so long as we reset the tracking semaphore value to 0, it will
	 * always be before the next request's seqno. If we don't reset
	 * the semaphore value, then when the seqno moves backwards all
	 * future waits will complete instantly (causing rendering corruption).
	 */
	if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) {
		I915_WRITE(RING_SYNC_0(engine->mmio_base), 0);
		I915_WRITE(RING_SYNC_1(engine->mmio_base), 0);
		if (HAS_VEBOX(dev_priv))
			I915_WRITE(RING_SYNC_2(engine->mmio_base), 0);
	}
	if (dev_priv->semaphore_obj) {
		struct drm_i915_gem_object *obj = dev_priv->semaphore_obj;
		struct page *page = i915_gem_object_get_dirty_page(obj, 0);
		void *semaphores = kmap(page);
		memset(semaphores + GEN8_SEMAPHORE_OFFSET(engine->id, 0),
		       0, I915_NUM_ENGINES * gen8_semaphore_seqno_size);
		kunmap(page);
	}
	memset(engine->semaphore.sync_seqno, 0,
	       sizeof(engine->semaphore.sync_seqno));

	intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
	if (engine->irq_seqno_barrier)
		engine->irq_seqno_barrier(engine);
	engine->last_submitted_seqno = seqno;

	engine->hangcheck.seqno = seqno;

	/* After manually advancing the seqno, fake the interrupt in case
	 * there are any waiters for that seqno.
	 */
	rcu_read_lock();
	intel_engine_wakeup(engine);
	rcu_read_unlock();
}

static void gen6_bsd_ring_write_tail(struct intel_engine_cs *engine,
				     u32 value)
{
	struct drm_i915_private *dev_priv = engine->i915;

	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);

	/* Every tail move must follow the sequence below */

	/* Disable notification that the ring is IDLE. The GT
	 * will then assume that it is busy and bring it out of rc6.
	 */
	I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL,
		      _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));

	/* Clear the context id. Here be magic! */
	I915_WRITE64_FW(GEN6_BSD_RNCID, 0x0);

	/* Wait for the ring not to be idle, i.e. for it to wake up. */
	if (intel_wait_for_register_fw(dev_priv,
				       GEN6_BSD_SLEEP_PSMI_CONTROL,
				       GEN6_BSD_SLEEP_INDICATOR,
				       0,
				       50))
		DRM_ERROR("timed out waiting for the BSD ring to wake up\n");

	/* Now that the ring is fully powered up, update the tail */
	I915_WRITE_FW(RING_TAIL(engine->mmio_base), value);
	POSTING_READ_FW(RING_TAIL(engine->mmio_base));

	/* Let the ring send IDLE messages to the GT again,
	 * and so let it sleep to conserve power when idle.
	 */
	I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL,
		      _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));

	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
}

static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req,
			       u32 invalidate, u32 flush)
{
	struct intel_engine_cs *engine = req->engine;
	uint32_t cmd;
	int ret;

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	cmd = MI_FLUSH_DW;
	if (INTEL_GEN(req->i915) >= 8)
		cmd += 1;

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.5 - video engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	if (invalidate & I915_GEM_GPU_DOMAINS)
		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;

	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine,
			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
	if (INTEL_GEN(req->i915) >= 8) {
		intel_ring_emit(engine, 0); /* upper addr */
		intel_ring_emit(engine, 0); /* value */
	} else  {
		intel_ring_emit(engine, 0);
		intel_ring_emit(engine, MI_NOOP);
	}
	intel_ring_advance(engine);
	return 0;
}

static int
gen8_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
			      u64 offset, u32 len,
			      unsigned dispatch_flags)
{
	struct intel_engine_cs *engine = req->engine;
	bool ppgtt = USES_PPGTT(engine->dev) &&
			!(dispatch_flags & I915_DISPATCH_SECURE);
	int ret;

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	/* FIXME(BDW): Address space and security selectors. */
	intel_ring_emit(engine, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
			(dispatch_flags & I915_DISPATCH_RS ?
			 MI_BATCH_RESOURCE_STREAMER : 0));
	intel_ring_emit(engine, lower_32_bits(offset));
	intel_ring_emit(engine, upper_32_bits(offset));
	intel_ring_emit(engine, MI_NOOP);
	intel_ring_advance(engine);

	return 0;
}

static int
hsw_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
			     u64 offset, u32 len,
			     unsigned dispatch_flags)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine,
			MI_BATCH_BUFFER_START |
			(dispatch_flags & I915_DISPATCH_SECURE ?
			 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
			(dispatch_flags & I915_DISPATCH_RS ?
			 MI_BATCH_RESOURCE_STREAMER : 0));
	/* bit0-7 is the length on GEN6+ */
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);

	return 0;
}

static int
gen6_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
			      u64 offset, u32 len,
			      unsigned dispatch_flags)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(engine,
			MI_BATCH_BUFFER_START |
			(dispatch_flags & I915_DISPATCH_SECURE ?
			 0 : MI_BATCH_NON_SECURE_I965));
	/* bit0-7 is the length on GEN6+ */
	intel_ring_emit(engine, offset);
	intel_ring_advance(engine);

	return 0;
}

/* Blitter support (SandyBridge+) */

static int gen6_ring_flush(struct drm_i915_gem_request *req,
			   u32 invalidate, u32 flush)
{
	struct intel_engine_cs *engine = req->engine;
	uint32_t cmd;
	int ret;

	ret = intel_ring_begin(req, 4);
	if (ret)
		return ret;

	cmd = MI_FLUSH_DW;
	if (INTEL_GEN(req->i915) >= 8)
		cmd += 1;

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	if (invalidate & I915_GEM_DOMAIN_RENDER)
		cmd |= MI_INVALIDATE_TLB;
	intel_ring_emit(engine, cmd);
	intel_ring_emit(engine,
			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
	if (INTEL_GEN(req->i915) >= 8) {
		intel_ring_emit(engine, 0); /* upper addr */
		intel_ring_emit(engine, 0); /* value */
	} else  {
		intel_ring_emit(engine, 0);
		intel_ring_emit(engine, MI_NOOP);
	}
	intel_ring_advance(engine);

	return 0;
}

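/* Summary of the setup below: gen8+ signals via a shared semaphore bo
 * (one GGTT offset per engine pair), while gen6/7 use the per-pair MMIO
 * sync mailboxes described by the sem_data table. If the semaphore bo
 * cannot be allocated or pinned, semaphores are disabled altogether.
 */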
static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv,
				       struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	int ret, i;

	if (!i915.semaphores)
		return;

	if (INTEL_GEN(dev_priv) >= 8 && !dev_priv->semaphore_obj) {
		obj = i915_gem_object_create(&dev_priv->drm, 4096);
		if (IS_ERR(obj)) {
			DRM_ERROR("Failed to allocate semaphore bo. Disabling semaphores\n");
			i915.semaphores = 0;
		} else {
			i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
			ret = i915_gem_obj_ggtt_pin(obj, 0, PIN_NONBLOCK);
			if (ret != 0) {
				i915_gem_object_put(obj);
				DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n");
				i915.semaphores = 0;
			} else {
				dev_priv->semaphore_obj = obj;
			}
		}
	}

	if (!i915.semaphores)
		return;

	if (INTEL_GEN(dev_priv) >= 8) {
		u64 offset = i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj);

		engine->semaphore.sync_to = gen8_ring_sync;
		engine->semaphore.signal = gen8_xcs_signal;

		for (i = 0; i < I915_NUM_ENGINES; i++) {
			u64 ring_offset;

			if (i != engine->id)
				ring_offset = offset + GEN8_SEMAPHORE_OFFSET(engine->id, i);
			else
				ring_offset = MI_SEMAPHORE_SYNC_INVALID;

			engine->semaphore.signal_ggtt[i] = ring_offset;
		}
	} else if (INTEL_GEN(dev_priv) >= 6) {
		engine->semaphore.sync_to = gen6_ring_sync;
		engine->semaphore.signal = gen6_signal;

		/*
		 * The current semaphore is only applied on pre-gen8
		 * platform.  And there is no VCS2 ring on the pre-gen8
		 * platform. So the semaphore between RCS and VCS2 is
		 * initialized as INVALID.  Gen8 will initialize the
		 * sema between VCS2 and RCS later.
		 */
		for (i = 0; i < I915_NUM_ENGINES; i++) {
			static const struct {
				u32 wait_mbox;
				i915_reg_t mbox_reg;
			} sem_data[I915_NUM_ENGINES][I915_NUM_ENGINES] = {
				[RCS] = {
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_RV,  .mbox_reg = GEN6_VRSYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_RB,  .mbox_reg = GEN6_BRSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_RVE, .mbox_reg = GEN6_VERSYNC },
				},
				[VCS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VR,  .mbox_reg = GEN6_RVSYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VB,  .mbox_reg = GEN6_BVSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VVE, .mbox_reg = GEN6_VEVSYNC },
				},
				[BCS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_BR,  .mbox_reg = GEN6_RBSYNC },
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_BV,  .mbox_reg = GEN6_VBSYNC },
					[VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_BVE, .mbox_reg = GEN6_VEBSYNC },
				},
				[VECS] = {
					[RCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VER, .mbox_reg = GEN6_RVESYNC },
					[VCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VEV, .mbox_reg = GEN6_VVESYNC },
					[BCS] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VEB, .mbox_reg = GEN6_BVESYNC },
				},
			};
			u32 wait_mbox;
			i915_reg_t mbox_reg;

			if (i == engine->id || i == VCS2) {
				wait_mbox = MI_SEMAPHORE_SYNC_INVALID;
				mbox_reg = GEN6_NOSYNC;
			} else {
				wait_mbox = sem_data[engine->id][i].wait_mbox;
				mbox_reg = sem_data[engine->id][i].mbox_reg;
			}

			engine->semaphore.mbox.wait[i] = wait_mbox;
			engine->semaphore.mbox.signal[i] = mbox_reg;
		}
	}
}

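/* Select the irq enable/disable hooks (and, where one is defined above,
 * the matching seqno barrier) appropriate to the hardware generation.
 */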
static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
				struct intel_engine_cs *engine)
{
	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << engine->irq_shift;

	if (INTEL_GEN(dev_priv) >= 8) {
		engine->irq_enable = gen8_irq_enable;
		engine->irq_disable = gen8_irq_disable;
		engine->irq_seqno_barrier = gen6_seqno_barrier;
	} else if (INTEL_GEN(dev_priv) >= 6) {
		engine->irq_enable = gen6_irq_enable;
		engine->irq_disable = gen6_irq_disable;
		engine->irq_seqno_barrier = gen6_seqno_barrier;
	} else if (INTEL_GEN(dev_priv) >= 5) {
		engine->irq_enable = gen5_irq_enable;
		engine->irq_disable = gen5_irq_disable;
		engine->irq_seqno_barrier = gen5_seqno_barrier;
	} else if (INTEL_GEN(dev_priv) >= 3) {
		engine->irq_enable = i9xx_irq_enable;
		engine->irq_disable = i9xx_irq_disable;
	} else {
		engine->irq_enable = i8xx_irq_enable;
		engine->irq_disable = i8xx_irq_disable;
	}
}

static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
				      struct intel_engine_cs *engine)
{
	engine->init_hw = init_ring_common;
	engine->write_tail = ring_write_tail;

	engine->add_request = i9xx_add_request;
	if (INTEL_GEN(dev_priv) >= 6)
		engine->add_request = gen6_add_request;

	if (INTEL_GEN(dev_priv) >= 8)
		engine->dispatch_execbuffer = gen8_ring_dispatch_execbuffer;
	else if (INTEL_GEN(dev_priv) >= 6)
		engine->dispatch_execbuffer = gen6_ring_dispatch_execbuffer;
	else if (INTEL_GEN(dev_priv) >= 4)
		engine->dispatch_execbuffer = i965_dispatch_execbuffer;
	else if (IS_I830(dev_priv) || IS_845G(dev_priv))
		engine->dispatch_execbuffer = i830_dispatch_execbuffer;
	else
		engine->dispatch_execbuffer = i915_dispatch_execbuffer;

	intel_ring_init_irq(dev_priv, engine);
	intel_ring_init_semaphores(dev_priv, engine);
}

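/* The intel_init_*_ring_buffer() functions below first install the default
 * vfuncs, then apply per-generation overrides, and finally call
 * intel_init_ring_buffer() to allocate, pin and map the ring itself.
 */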
int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	intel_ring_default_vfuncs(dev_priv, engine);

	if (HAS_L3_DPF(dev_priv))
		engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;

	if (INTEL_GEN(dev_priv) >= 8) {
		engine->init_context = intel_rcs_ctx_init;
		engine->add_request = gen8_render_add_request;
		engine->flush = gen8_render_ring_flush;
		if (i915.semaphores)
			engine->semaphore.signal = gen8_rcs_signal;
	} else if (INTEL_GEN(dev_priv) >= 6) {
		engine->init_context = intel_rcs_ctx_init;
		engine->flush = gen7_render_ring_flush;
		if (IS_GEN6(dev_priv))
			engine->flush = gen6_render_ring_flush;
	} else if (IS_GEN5(dev_priv)) {
		engine->flush = gen4_render_ring_flush;
	} else {
		if (INTEL_GEN(dev_priv) < 4)
			engine->flush = gen2_render_ring_flush;
		else
			engine->flush = gen4_render_ring_flush;
		engine->irq_enable_mask = I915_USER_INTERRUPT;
	}

	if (IS_HASWELL(dev_priv))
		engine->dispatch_execbuffer = hsw_ring_dispatch_execbuffer;

	engine->init_hw = init_render_ring;
	engine->cleanup = render_ring_cleanup;

	ret = intel_init_ring_buffer(engine);
	if (ret)
		return ret;

	if (INTEL_GEN(dev_priv) >= 6) {
		ret = intel_init_pipe_control(engine, 4096);
		if (ret)
			return ret;
	} else if (HAS_BROKEN_CS_TLB(dev_priv)) {
		ret = intel_init_pipe_control(engine, I830_WA_SIZE);
		if (ret)
			return ret;
	}

	return 0;
}

int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	intel_ring_default_vfuncs(dev_priv, engine);

	if (INTEL_GEN(dev_priv) >= 6) {
		/* gen6 bsd needs a special wa for tail updates */
		if (IS_GEN6(dev_priv))
			engine->write_tail = gen6_bsd_ring_write_tail;
		engine->flush = gen6_bsd_ring_flush;
		if (INTEL_GEN(dev_priv) < 8)
			engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
	} else {
		engine->mmio_base = BSD_RING_BASE;
		engine->flush = bsd_ring_flush;
		if (IS_GEN5(dev_priv))
			engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
		else
			engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
	}

	return intel_init_ring_buffer(engine);
}

/**
 * Initialize the second BSD ring (eg. Broadwell GT3, Skylake GT3)
 */
int intel_init_bsd2_ring_buffer(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	intel_ring_default_vfuncs(dev_priv, engine);

	engine->flush = gen6_bsd_ring_flush;

	return intel_init_ring_buffer(engine);
}

int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	intel_ring_default_vfuncs(dev_priv, engine);

	engine->flush = gen6_ring_flush;
	if (INTEL_GEN(dev_priv) < 8)
		engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;

	return intel_init_ring_buffer(engine);
}

int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	intel_ring_default_vfuncs(dev_priv, engine);

	engine->flush = gen6_ring_flush;

	if (INTEL_GEN(dev_priv) < 8) {
		engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
		engine->irq_enable = hsw_vebox_irq_enable;
		engine->irq_disable = hsw_vebox_irq_disable;
	}

	return intel_init_ring_buffer(engine);
}

int
intel_ring_flush_all_caches(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	int ret;

	if (!engine->gpu_caches_dirty)
		return 0;

	ret = engine->flush(req, 0, I915_GEM_GPU_DOMAINS);
	if (ret)
		return ret;

	trace_i915_gem_ring_flush(req, 0, I915_GEM_GPU_DOMAINS);

	engine->gpu_caches_dirty = false;
	return 0;
}

int
intel_ring_invalidate_all_caches(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *engine = req->engine;
	uint32_t flush_domains;
	int ret;

	flush_domains = 0;
	if (engine->gpu_caches_dirty)
		flush_domains = I915_GEM_GPU_DOMAINS;

	ret = engine->flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
	if (ret)
		return ret;

	trace_i915_gem_ring_flush(req, I915_GEM_GPU_DOMAINS, flush_domains);

	engine->gpu_caches_dirty = false;
	return 0;
}

void
intel_stop_engine(struct intel_engine_cs *engine)
{
	int ret;

	if (!intel_engine_initialized(engine))
		return;

	ret = intel_engine_idle(engine);
	if (ret)
		DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
			  engine->name, ret);

	stop_ring(engine);
}