From 6ac42f4148bc27e5ffd18a9ab0eac57f58822af4 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Sat, 21 Jul 2012 12:25:01 +0200
Subject: [PATCH] drm/i915: Replace the complex flushing logic with simple
 invalidate/flush all

Now that we unconditionally flush and invalidate between every batch
buffer, we no longer need the complex logic to decide which domains
require flushing. Remove it and rejoice.

v2 (danvet): Keep around the flip waiting logic. It's gross and broken,
I know, but we can't just kill that thing ... even if we just keep it
around as a reminder that things are broken.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 222 ++-------------------
 1 file changed, 20 insertions(+), 202 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 36c940c1a978..6c810798de92 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -34,180 +34,6 @@
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
 
-struct change_domains {
-	uint32_t invalidate_domains;
-	uint32_t flush_domains;
-	uint32_t flush_rings;
-	uint32_t flips;
-};
-
-/*
- * Set the next domain for the specified object. This
- * may not actually perform the necessary flushing/invaliding though,
- * as that may want to be batched with other set_domain operations
- *
- * This is (we hope) the only really tricky part of gem. The goal
- * is fairly simple -- track which caches hold bits of the object
- * and make sure they remain coherent. A few concrete examples may
- * help to explain how it works. For shorthand, we use the notation
- * (read_domains, write_domain), e.g. (CPU, CPU) to indicate the
- * a pair of read and write domain masks.
- *
- * Case 1: the batch buffer
- *
- *	1. Allocated
- *	2. Written by CPU
- *	3. Mapped to GTT
- *	4. Read by GPU
- *	5. Unmapped from GTT
- *	6. Freed
- *
- *	Let's take these a step at a time
- *
- *	1. Allocated
- *		Pages allocated from the kernel may still have
- *		cache contents, so we set them to (CPU, CPU) always.
- *	2. Written by CPU (using pwrite)
- *		The pwrite function calls set_domain (CPU, CPU) and
- *		this function does nothing (as nothing changes)
- *	3. Mapped by GTT
- *		This function asserts that the object is not
- *		currently in any GPU-based read or write domains
- *	4. Read by GPU
- *		i915_gem_execbuffer calls set_domain (COMMAND, 0).
- *		As write_domain is zero, this function adds in the
- *		current read domains (CPU+COMMAND, 0).
- *		flush_domains is set to CPU.
- *		invalidate_domains is set to COMMAND
- *		clflush is run to get data out of the CPU caches
- *		then i915_dev_set_domain calls i915_gem_flush to
- *		emit an MI_FLUSH and drm_agp_chipset_flush
- *	5. Unmapped from GTT
- *		i915_gem_object_unbind calls set_domain (CPU, CPU)
- *		flush_domains and invalidate_domains end up both zero
- *		so no flushing/invalidating happens
- *	6. Freed
- *		yay, done
- *
- * Case 2: The shared render buffer
- *
- *	1. Allocated
- *	2. Mapped to GTT
- *	3. Read/written by GPU
- *	4. set_domain to (CPU,CPU)
- *	5. Read/written by CPU
- *	6. Read/written by GPU
- *
- *	1. Allocated
- *		Same as last example, (CPU, CPU)
- *	2. Mapped to GTT
- *		Nothing changes (assertions find that it is not in the GPU)
- *	3. Read/written by GPU
- *		execbuffer calls set_domain (RENDER, RENDER)
- *		flush_domains gets CPU
- *		invalidate_domains gets GPU
- *		clflush (obj)
- *		MI_FLUSH and drm_agp_chipset_flush
- *	4. set_domain (CPU, CPU)
- *		flush_domains gets GPU
- *		invalidate_domains gets CPU
- *		wait_rendering (obj) to make sure all drawing is complete.
- *		This will include an MI_FLUSH to get the data from GPU
- *		to memory
- *		clflush (obj) to invalidate the CPU cache
- *		Another MI_FLUSH in i915_gem_flush (eliminate this somehow?)
- *	5. Read/written by CPU
- *		cache lines are loaded and dirtied
- *	6. Read written by GPU
- *		Same as last GPU access
- *
- * Case 3: The constant buffer
- *
- *	1. Allocated
- *	2. Written by CPU
- *	3. Read by GPU
- *	4. Updated (written) by CPU again
- *	5. Read by GPU
- *
- *	1. Allocated
- *		(CPU, CPU)
- *	2. Written by CPU
- *		(CPU, CPU)
- *	3. Read by GPU
- *		(CPU+RENDER, 0)
- *		flush_domains = CPU
- *		invalidate_domains = RENDER
- *		clflush (obj)
- *		MI_FLUSH
- *		drm_agp_chipset_flush
- *	4. Updated (written) by CPU again
- *		(CPU, CPU)
- *		flush_domains = 0 (no previous write domain)
- *		invalidate_domains = 0 (no new read domains)
- *	5. Read by GPU
- *		(CPU+RENDER, 0)
- *		flush_domains = CPU
- *		invalidate_domains = RENDER
- *		clflush (obj)
- *		MI_FLUSH
- *		drm_agp_chipset_flush
- */
-static void
-i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
-				  struct intel_ring_buffer *ring,
-				  struct change_domains *cd)
-{
-	uint32_t invalidate_domains = 0, flush_domains = 0;
-
-	/*
-	 * If the object isn't moving to a new write domain,
-	 * let the object stay in multiple read domains
-	 */
-	if (obj->base.pending_write_domain == 0)
-		obj->base.pending_read_domains |= obj->base.read_domains;
-
-	/*
-	 * Flush the current write domain if
-	 * the new read domains don't match. Invalidate
-	 * any read domains which differ from the old
-	 * write domain
-	 */
-	if (obj->base.write_domain &&
-	    (((obj->base.write_domain != obj->base.pending_read_domains ||
-	       obj->ring != ring)) ||
-	     (obj->fenced_gpu_access && !obj->pending_fenced_gpu_access))) {
-		flush_domains |= obj->base.write_domain;
-		invalidate_domains |=
-			obj->base.pending_read_domains & ~obj->base.write_domain;
-	}
-	/*
-	 * Invalidate any read caches which may have
-	 * stale data. That is, any new read domains.
-	 */
-	invalidate_domains |= obj->base.pending_read_domains & ~obj->base.read_domains;
-	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU)
-		i915_gem_clflush_object(obj);
-
-	if (obj->base.pending_write_domain)
-		cd->flips |= atomic_read(&obj->pending_flip);
-
-	/* The actual obj->write_domain will be updated with
-	 * pending_write_domain after we emit the accumulated flush for all
-	 * of our domain changes in execbuffers (which clears objects'
-	 * write_domains).  So if we have a current write domain that we
-	 * aren't changing, set pending_write_domain to that.
-	 */
-	if (flush_domains == 0 && obj->base.pending_write_domain == 0)
-		obj->base.pending_write_domain = obj->base.write_domain;
-
-	cd->invalidate_domains |= invalidate_domains;
-	cd->flush_domains |= flush_domains;
-	if (flush_domains & I915_GEM_GPU_DOMAINS)
-		cd->flush_rings |= intel_ring_flag(obj->ring);
-	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
-		cd->flush_rings |= intel_ring_flag(ring);
-}
-
 struct eb_objects {
 	int and;
 	struct hlist_head buckets[0];
@@ -810,18 +636,6 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 	return ret;
 }
 
-static void
-i915_gem_execbuffer_flush(struct drm_device *dev,
-			  uint32_t invalidate_domains,
-			  uint32_t flush_domains)
-{
-	if (flush_domains & I915_GEM_DOMAIN_CPU)
-		intel_gtt_chipset_flush();
-
-	if (flush_domains & I915_GEM_DOMAIN_GTT)
-		wmb();
-}
-
 static int
 i915_gem_execbuffer_wait_for_flips(struct intel_ring_buffer *ring, u32 flips)
 {
@@ -854,37 +668,41 @@ i915_gem_execbuffer_wait_for_flips(struct intel_ring_buffer *ring, u32 flips)
 	return 0;
 }
 
-
 static int
 i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
 				struct list_head *objects)
 {
 	struct drm_i915_gem_object *obj;
-	struct change_domains cd;
+	uint32_t flush_domains = 0;
+	uint32_t flips = 0;
 	int ret;
 
-	memset(&cd, 0, sizeof(cd));
-	list_for_each_entry(obj, objects, exec_list)
-		i915_gem_object_set_to_gpu_domain(obj, ring, &cd);
-
-	if (cd.invalidate_domains | cd.flush_domains) {
-		i915_gem_execbuffer_flush(ring->dev,
-					  cd.invalidate_domains,
-					  cd.flush_domains);
-	}
-
-	if (cd.flips) {
-		ret = i915_gem_execbuffer_wait_for_flips(ring, cd.flips);
+	list_for_each_entry(obj, objects, exec_list) {
+		ret = i915_gem_object_sync(obj, ring);
 		if (ret)
 			return ret;
+
+		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
+			i915_gem_clflush_object(obj);
+
+		if (obj->base.pending_write_domain)
+			flips |= atomic_read(&obj->pending_flip);
+
+		flush_domains |= obj->base.write_domain;
 	}
 
-	list_for_each_entry(obj, objects, exec_list) {
-		ret = i915_gem_object_sync(obj, ring);
+	if (flips) {
+		ret = i915_gem_execbuffer_wait_for_flips(ring, flips);
 		if (ret)
 			return ret;
 	}
 
+	if (flush_domains & I915_GEM_DOMAIN_CPU)
+		intel_gtt_chipset_flush();
+
+	if (flush_domains & I915_GEM_DOMAIN_GTT)
+		wmb();
+
 	/* Unconditionally invalidate gpu caches and ensure that we do flush
 	 * any residual writes from the previous batch.
	 */
-- 
GitLab
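
(Illustration, not part of the patch.) The hunks above trade precision for simplicity: the deleted
i915_gem_object_set_to_gpu_domain() diffed each object's current (read_domains, write_domain) state
against what the next batch needs and accumulated a minimal flush/invalidate set, while the rewritten
i915_gem_execbuffer_move_to_gpu() only clflushes CPU-dirty objects and relies on the ring doing an
unconditional GPU cache invalidate/flush between batches. The standalone C sketch below models both
schemes with plain bitmasks; every identifier in it (mock_obj, DOMAIN_*, accumulate_domain_changes)
is invented for the example and merely stands in for the driver's drm_i915_gem_object and
I915_GEM_DOMAIN_* flags.

	/* Userspace sketch of the two flushing schemes; not kernel code. */
	#include <stdint.h>
	#include <stdio.h>

	#define DOMAIN_CPU    (1u << 0)	/* data lives in CPU caches           */
	#define DOMAIN_GTT    (1u << 1)	/* data accessed through the GTT      */
	#define DOMAIN_RENDER (1u << 2)	/* data lives in the GPU render cache */

	struct mock_obj {
		uint32_t read_domains;		/* caches currently holding valid data */
		uint32_t write_domain;		/* cache currently holding dirty data  */
		uint32_t pending_read_domains;	/* domains the next batch reads from   */
	};

	/* Old scheme (removed by the patch): diff current against pending
	 * domains and accumulate a minimal flush/invalidate set per batch. */
	static void accumulate_domain_changes(const struct mock_obj *obj,
					      uint32_t *flush, uint32_t *invalidate)
	{
		/* Flush a dirty cache if the batch reads through another one. */
		if (obj->write_domain && obj->write_domain != obj->pending_read_domains)
			*flush |= obj->write_domain;

		/* Invalidate any cache the batch reads that isn't valid yet. */
		*invalidate |= obj->pending_read_domains & ~obj->read_domains;
	}

	int main(void)
	{
		struct mock_obj objs[] = {
			/* batch buffer: written by the CPU, read by the GPU */
			{ .read_domains = DOMAIN_CPU, .write_domain = DOMAIN_CPU,
			  .pending_read_domains = DOMAIN_RENDER },
			/* render target: GPU-only, nothing to do in the old scheme */
			{ .read_domains = DOMAIN_RENDER, .write_domain = DOMAIN_RENDER,
			  .pending_read_domains = DOMAIN_RENDER },
		};
		uint32_t flush = 0, invalidate = 0;
		unsigned int i;

		for (i = 0; i < sizeof(objs) / sizeof(objs[0]); i++)
			accumulate_domain_changes(&objs[i], &flush, &invalidate);
		printf("old scheme: flush 0x%x, invalidate 0x%x\n",
		       (unsigned int)flush, (unsigned int)invalidate);

		/* New scheme: GPU caches are unconditionally invalidated and
		 * flushed between batches, so per object only CPU-dirty data
		 * still needs an explicit clflush before submission. */
		for (i = 0; i < sizeof(objs) / sizeof(objs[0]); i++)
			if (objs[i].write_domain & DOMAIN_CPU)
				printf("object %u: clflush before submit\n", i);

		return 0;
	}

Compiled with e.g. "cc -Wall sketch.c", the old-scheme pass prints "flush 0x1, invalidate 0x4" for
the CPU-written batch buffer, mirroring Case 3 of the deleted comment (flush_domains = CPU,
invalidate_domains = RENDER); the new-scheme pass reduces per-object work to a single clflush check,
which is what lets the patch drop ~200 lines of bookkeeping at the cost of a possibly redundant full
flush each batch.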