drm/i915: Generate a hang error code

We get a large number of bugs which have a, "hey I have that too" because they see a GPU hang in dmesg. While two machines of the same model having a GPU hang is indeed a coincidence, it is far from enough evidence to suggest they are the same. In order to reduce this effect, and hopefully get people to file new bug reports, clearly the error message itself has been insufficient (see ref at the bottom for a new bug report with this characteristic). The algorithm is purposely pretty naive. I don't think we need much in order to avoid the problem I am trying to solve, and keeping it naive gives us some ability to make a decent test case. Cc: Jesse Barnes <jbarnes@virtuousgeek.org> References: https://bugs.freedesktop.org/show_bug.cgi?id=73276Signed-off-by: N Ben Widawsky <ben@bwidawsk.net> Signed-off-by: N Daniel Vetter <daniel.vetter@ffwll.ch>

drm/i915: Generate a hang error code
We get a large number of bugs which have a, "hey I have that too" because they see a GPU hang in dmesg. While two machines of the same model having a GPU hang is indeed a coincidence, it is far from enough evidence to suggest they are the same. In order to reduce this effect, and hopefully get people to file new bug reports, clearly the error message itself has been insufficient (see ref at the bottom for a new bug report with this characteristic). The algorithm is purposely pretty naive. I don't think we need much in order to avoid the problem I am trying to solve, and keeping it naive gives us some ability to make a decent test case. Cc: Jesse Barnes <jbarnes@virtuousgeek.org> References: https://bugs.freedesktop.org/show_bug.cgi?id=73276Signed-off-by: N Ben Widawsky <ben@bwidawsk.net> Signed-off-by: N Daniel Vetter <daniel.vetter@ffwll.ch>
011cf577 · Ben Widawsky · Daniel Vetter · 579a9b0e · 011cf577
显示空白变更内容
内联并排

Showing with 37 addition and 7 deletion

drivers/gpu/drm/i915/i915_gpu_error.c drivers/gpu/drm/i915/i915_gpu_error.c +37 -7

未找到文件。
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -653,6 +653,33 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
 	return i;
 }

+/* Generate a semi-unique error code. The code is not meant to have meaning, The
+ * code's only purpose is to try to prevent false duplicated bug reports by
+ * grossly estimating a GPU error state.
+ *
+ * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
+ * the hang if we could strip the GTT offset information from it.
+ *
+ * It's only a small step better than a random number in its current form.
+ */
+static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
+					 struct drm_i915_error_state *error)
+{
+	uint32_t error_code = 0;
+	int i;
+
+	/* IPEHR would be an ideal way to detect errors, as it's the gross
+	 * measure of "the command that hung." However, has some very common
+	 * synchronization commands which almost always appear in the case
+	 * strictly a client bug. Use instdone to differentiate those some.
+	 */
+	for (i = 0; i < I915_NUM_RINGS; i++)
+		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG)
+			return error->ring[i].ipehr ^ error->ring[i].instdone;
+
+	return error_code;
+}
+
 static void i915_gem_record_fences(struct drm_device *dev,
 				   struct drm_i915_error_state *error)
 {
@@ -1098,6 +1125,7 @@ void i915_capture_error_state(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error;
 	unsigned long flags;
+	uint32_t ecode;

 	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
 	error = dev_priv->gpu_error.first_error;
@@ -1114,7 +1142,16 @@ void i915_capture_error_state(struct drm_device *dev)

 	DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
 		 dev->primary->index);
+	kref_init(&error->ref);
+
+	i915_capture_reg_state(dev_priv, error);
+	i915_gem_capture_buffers(dev_priv, error);
+	i915_gem_record_fences(dev, error);
+	i915_gem_record_rings(dev, error);
+	ecode = i915_error_generate_code(dev_priv, error);
+
 	if (!warned) {
+		DRM_INFO("GPU HANG [%x]\n", ecode);
 		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
 		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
 		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
@@ -1122,13 +1159,6 @@ void i915_capture_error_state(struct drm_device *dev)
 		warned = true;
 	}

-	kref_init(&error->ref);
-
-	i915_capture_reg_state(dev_priv, error);
-	i915_gem_capture_buffers(dev_priv, error);
-	i915_gem_record_fences(dev, error);
-	i915_gem_record_rings(dev, error);
-
 	do_gettimeofday(&error->time);

 	error->overlay = intel_overlay_capture_error_state(dev);