diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 1f1761fc6597d09ab6446c277fded7aceae96215..5b31e1e05ddd6423d708390adc338583ae1e82a0 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -116,6 +116,7 @@ intel_context_init(struct intel_context *ce, ce->engine = engine; ce->ops = engine->cops; ce->sseu = engine->sseu; + ce->saturated = 0; INIT_LIST_HEAD(&ce->signal_link); INIT_LIST_HEAD(&ce->signals); @@ -158,6 +159,7 @@ void intel_context_enter_engine(struct intel_context *ce) void intel_context_exit_engine(struct intel_context *ce) { + ce->saturated = 0; intel_engine_pm_put(ce->engine); } diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index d5a7dbd0daeeff553b2cfaeab2c0541c4c796964..963a312430e6d37a8a3a627afab750d77df8c550 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -13,6 +13,7 @@ #include #include "i915_active_types.h" +#include "intel_engine_types.h" #include "intel_sseu.h" struct i915_gem_context; @@ -52,6 +53,8 @@ struct intel_context { atomic_t pin_count; struct mutex pin_mutex; /* guards pinning and associated on-gpuing */ + intel_engine_mask_t saturated; /* submitting semaphores too late? */ + /** * active_tracker: Active tracker for the external rq activity * on this intel_context object. diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index d06c45305b03d9af0b7f9ea11e338ed087e6f073..ae9ce46e1b15055e886a4cfce0b661b8332032b5 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -410,6 +410,26 @@ void __i915_request_submit(struct i915_request *request) if (i915_gem_context_is_banned(request->gem_context)) i915_request_skip(request, -EIO); + /* + * Are we using semaphores when the gpu is already saturated? + * + * Using semaphores incurs a cost in having the GPU poll a + * memory location, busywaiting for it to change. The continual + * memory reads can have a noticeable impact on the rest of the + * system with the extra bus traffic, stalling the cpu as it too + * tries to access memory across the bus (perf stat -e bus-cycles). + * + * If we installed a semaphore on this request and we only submit + * the request after the signaler completed, that indicates the + * system is overloaded and using semaphores at this time only + * increases the amount of work we are doing. If so, we disable + * further use of semaphores until we are idle again, whence we + * optimistically try again. + */ + if (request->sched.semaphores && + i915_sw_fence_signaled(&request->semaphore)) + request->hw_context->saturated |= request->sched.semaphores; + /* We may be recursing from the signal callback of another i915 fence */ spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); @@ -785,6 +805,24 @@ i915_request_await_start(struct i915_request *rq, struct i915_request *signal) I915_FENCE_GFP); } +static intel_engine_mask_t +already_busywaiting(struct i915_request *rq) +{ + /* + * Polling a semaphore causes bus traffic, delaying other users of + * both the GPU and CPU. We want to limit the impact on others, + * while taking advantage of early submission to reduce GPU + * latency. Therefore we restrict ourselves to not using more + * than one semaphore from each source, and not using a semaphore + * if we have detected the engine is saturated (i.e. would not be + * submitted early and cause bus traffic reading an already passed + * semaphore). + * + * See the are-we-too-late? check in __i915_request_submit(). + */ + return rq->sched.semaphores | rq->hw_context->saturated; +} + static int emit_semaphore_wait(struct i915_request *to, struct i915_request *from, @@ -798,7 +836,7 @@ emit_semaphore_wait(struct i915_request *to, GEM_BUG_ON(INTEL_GEN(to->i915) < 8); /* Just emit the first semaphore we see as request space is limited. */ - if (to->sched.semaphores & from->engine->mask) + if (already_busywaiting(to) & from->engine->mask) return i915_sw_fence_await_dma_fence(&to->submit, &from->fence, 0, I915_FENCE_GFP);