drm/msm: add perf logging debugfs

Signed-off-by: N Rob Clark <robdclark@gmail.com>

drm/msm: add perf logging debugfs
Signed-off-by: N Rob Clark <robdclark@gmail.com>
70c70f09 · Rob Clark · a7d3c950 · 70c70f09 · 70c70f09 · 70c70f09
7 changed file
--- a/drivers/gpu/drm/msm/Makefile
+++ b/drivers/gpu/drm/msm/Makefile
@@ -34,6 +34,7 @@ msm-y := \
 	msm_gem_submit.o \
 	msm_gpu.o \
 	msm_iommu.o \
+	msm_perf.o \
 	msm_rd.o \
 	msm_ringbuffer.o

--- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -207,11 +207,11 @@ static int a3xx_hw_init(struct msm_gpu *gpu)
 	/* Turn on performance counters: */
 	gpu_write(gpu, REG_A3XX_RBBM_PERFCTR_CTL, 0x01);
-	/* Set SP perfcounter 7 to count SP_FS_FULL_ALU_INSTRUCTIONS
+	/* Enable the perfcntrs that we use.. */
-	 * we will use this to augment our hang detection:
+	for (i = 0; i < gpu->num_perfcntrs; i++) {
-	 */
+		const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];
-	gpu_write(gpu, REG_A3XX_SP_PERFCOUNTER7_SELECT,
+		gpu_write(gpu, perfcntr->select_reg, perfcntr->select_val);
-			SP_FS_FULL_ALU_INSTRUCTIONS);
+	}
 	gpu_write(gpu, REG_A3XX_RBBM_INT_0_MASK, A3XX_INT0_MASK);
@@ -465,6 +465,13 @@ static const struct adreno_gpu_funcs funcs = {
 	},
 };
+static const struct msm_gpu_perfcntr perfcntrs[] = {
+	{ REG_A3XX_SP_PERFCOUNTER6_SELECT, REG_A3XX_RBBM_PERFCTR_SP_6_LO,
+			SP_ALU_ACTIVE_CYCLES, "ALUACTIVE" },
+	{ REG_A3XX_SP_PERFCOUNTER7_SELECT, REG_A3XX_RBBM_PERFCTR_SP_7_LO,
+			SP_FS_FULL_ALU_INSTRUCTIONS, "ALUFULL" },
+};
 struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
 {
 	struct a3xx_gpu *a3xx_gpu = NULL;
@@ -504,6 +511,9 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
 	DBG("fast_rate=%u, slow_rate=%u, bus_freq=%u",
 			gpu->fast_rate, gpu->slow_rate, gpu->bus_freq);
+	gpu->perfcntrs = perfcntrs;
+	gpu->num_perfcntrs = ARRAY_SIZE(perfcntrs);
 	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, config->rev);
 	if (ret)
 		goto fail;

--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -548,6 +548,12 @@ static int late_init_minor(struct drm_minor *minor)
 		return ret;
 	}
+	ret = msm_perf_debugfs_init(minor);
+	if (ret) {
+		dev_err(minor->dev->dev, "could not install perf debugfs\n");
+		return ret;
+	}
 	return 0;
 }
@@ -588,6 +594,7 @@ static void msm_debugfs_cleanup(struct drm_minor *minor)
 	if (!minor->dev->dev_private)
 		return;
 	msm_rd_debugfs_cleanup(minor);
+	msm_perf_debugfs_cleanup(minor);
 }
 #endif

--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -56,6 +56,7 @@ struct msm_kms;
 struct msm_gpu;
 struct msm_mmu;
 struct msm_rd_state;
+struct msm_perf_state;
 struct msm_gem_submit;
 #define NUM_DOMAINS 2    /* one for KMS, then one per gpu core (?) */
@@ -85,6 +86,7 @@ struct msm_drm_private {
 	wait_queue_head_t fence_event;
 	struct msm_rd_state *rd;
+	struct msm_perf_state *perf;
 	/* list of GEM objects: */
 	struct list_head inactive_list;
@@ -212,6 +214,8 @@ int msm_debugfs_late_init(struct drm_device *dev);
 int msm_rd_debugfs_init(struct drm_minor *minor);
 void msm_rd_debugfs_cleanup(struct drm_minor *minor);
 void msm_rd_dump_submit(struct msm_gem_submit *submit);
+int msm_perf_debugfs_init(struct drm_minor *minor);
+void msm_perf_debugfs_cleanup(struct drm_minor *minor);
 #else
 static inline int msm_debugfs_late_init(struct drm_device *dev) { return 0; }
 static inline void msm_rd_dump_submit(struct msm_gem_submit *submit) {}

--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -319,6 +319,101 @@ static void hangcheck_handler(unsigned long data)
 	queue_work(priv->wq, &gpu->retire_work);
 }
+/*
+ * Performance Counters:
+ */
+/* called under perf_lock */
+static int update_hw_cntrs(struct msm_gpu *gpu, uint32_t ncntrs, uint32_t *cntrs)
+{
+	uint32_t current_cntrs[ARRAY_SIZE(gpu->last_cntrs)];
+	int i, n = min(ncntrs, gpu->num_perfcntrs);
+	/* read current values: */
+	for (i = 0; i < gpu->num_perfcntrs; i++)
+		current_cntrs[i] = gpu_read(gpu, gpu->perfcntrs[i].sample_reg);
+	/* update cntrs: */
+	for (i = 0; i < n; i++)
+		cntrs[i] = current_cntrs[i] - gpu->last_cntrs[i];
+	/* save current values: */
+	for (i = 0; i < gpu->num_perfcntrs; i++)
+		gpu->last_cntrs[i] = current_cntrs[i];
+	return n;
+}
+static void update_sw_cntrs(struct msm_gpu *gpu)
+{
+	ktime_t time;
+	uint32_t elapsed;
+	unsigned long flags;
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+	if (!gpu->perfcntr_active)
+		goto out;
+	time = ktime_get();
+	elapsed = ktime_to_us(ktime_sub(time, gpu->last_sample.time));
+	gpu->totaltime += elapsed;
+	if (gpu->last_sample.active)
+		gpu->activetime += elapsed;
+	gpu->last_sample.active = msm_gpu_active(gpu);
+	gpu->last_sample.time = time;
+out:
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+}
+void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+	/* we could dynamically enable/disable perfcntr registers too.. */
+	gpu->last_sample.active = msm_gpu_active(gpu);
+	gpu->last_sample.time = ktime_get();
+	gpu->activetime = gpu->totaltime = 0;
+	gpu->perfcntr_active = true;
+	update_hw_cntrs(gpu, 0, NULL);
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+}
+void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
+{
+	gpu->perfcntr_active = false;
+}
+/* returns -errno or # of cntrs sampled */
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
+		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
+{
+	unsigned long flags;
+	int ret;
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+	if (!gpu->perfcntr_active) {
+		ret = -EINVAL;
+		goto out;
+	}
+	*activetime = gpu->activetime;
+	*totaltime = gpu->totaltime;
+	gpu->activetime = gpu->totaltime = 0;
+	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
+out:
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+	return ret;
+}
 /*
 * Cmdstream submission/retirement:
 */
@@ -361,6 +456,7 @@ void msm_gpu_retire(struct msm_gpu *gpu)
 {
 	struct msm_drm_private *priv = gpu->dev->dev_private;
 	queue_work(priv->wq, &gpu->retire_work);
+	update_sw_cntrs(gpu);
 }
 /* add bo's to gpu's ring, and kick gpu: */
@@ -381,6 +477,8 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 	gpu->submitted_fence = submit->fence;
+	update_sw_cntrs(gpu);
 	ret = gpu->funcs->submit(gpu, submit, ctx);
 	priv->lastctx = ctx;
@@ -433,6 +531,9 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	struct iommu_domain *iommu;
 	int i, ret;
+	if (WARN_ON(gpu->num_perfcntrs > ARRAY_SIZE(gpu->last_cntrs)))
+		gpu->num_perfcntrs = ARRAY_SIZE(gpu->last_cntrs);
 	gpu->dev = drm;
 	gpu->funcs = funcs;
 	gpu->name = name;
@@ -448,6 +549,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	setup_timer(&gpu->hangcheck_timer, hangcheck_handler,
 			(unsigned long)gpu);
+	spin_lock_init(&gpu->perf_lock);
 	BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks));
 	/* Map registers: */

--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -25,6 +25,7 @@
 #include "msm_ringbuffer.h"
 struct msm_gem_submit;
+struct msm_gpu_perfcntr;
 /* So far, with hardware that I've seen to date, we can have:
 *  + zero, one, or two z180 2d cores
@@ -64,6 +65,18 @@ struct msm_gpu {
 	struct drm_device *dev;
 	const struct msm_gpu_funcs *funcs;
+	/* performance counters (hw & sw): */
+	spinlock_t perf_lock;
+	bool perfcntr_active;
+	struct {
+		bool active;
+		ktime_t time;
+	} last_sample;
+	uint32_t totaltime, activetime;    /* sw counters */
+	uint32_t last_cntrs[5];            /* hw counters */
+	const struct msm_gpu_perfcntr *perfcntrs;
+	uint32_t num_perfcntrs;
 	struct msm_ringbuffer *rb;
 	uint32_t rb_iova;
@@ -113,6 +126,19 @@ static inline bool msm_gpu_active(struct msm_gpu *gpu)
 	return gpu->submitted_fence > gpu->funcs->last_fence(gpu);
 }
+/* Perf-Counters:
+ * The select_reg and select_val are just there for the benefit of the child
+ * class that actually enables the perf counter..  but msm_gpu base class
+ * will handle sampling/displaying the counters.
+ */
+struct msm_gpu_perfcntr {
+	uint32_t select_reg;
+	uint32_t sample_reg;
+	uint32_t select_val;
+	const char *name;
+};
 static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
 {
 	msm_writel(data, gpu->mmio + (reg << 2));
@@ -126,6 +152,11 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 int msm_gpu_pm_suspend(struct msm_gpu *gpu);
 int msm_gpu_pm_resume(struct msm_gpu *gpu);
+void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
+void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
+		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
 void msm_gpu_retire(struct msm_gpu *gpu);
 int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 		struct msm_file_private *ctx);

--- a/drivers/gpu/drm/msm/msm_perf.c
+++ b/drivers/gpu/drm/msm/msm_perf.c
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+/* For profiling, userspace can:
+ *
+ *   tail -f /sys/kernel/debug/dri/<minor>/gpu
+ *
+ * This will enable performance counters/profiling to track the busy time
+ * and any gpu specific performance counters that are supported.
+ */
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include "msm_drv.h"
+#include "msm_gpu.h"
+struct msm_perf_state {
+	struct drm_device *dev;
+	bool open;
+	int cnt;
+	struct mutex read_lock;
+	char buf[256];
+	int buftot, bufpos;
+	unsigned long next_jiffies;
+	struct dentry *ent;
+	struct drm_info_node *node;
+};
+#define SAMPLE_TIME (HZ/4)
+/* wait for next sample time: */
+static int wait_sample(struct msm_perf_state *perf)
+{
+	unsigned long start_jiffies = jiffies;
+	if (time_after(perf->next_jiffies, start_jiffies)) {
+		unsigned long remaining_jiffies =
+			perf->next_jiffies - start_jiffies;
+		int ret = schedule_timeout_interruptible(remaining_jiffies);
+		if (ret > 0) {
+			/* interrupted */
+			return -ERESTARTSYS;
+		}
+	}
+	perf->next_jiffies += SAMPLE_TIME;
+	return 0;
+}
+static int refill_buf(struct msm_perf_state *perf)
+{
+	struct msm_drm_private *priv = perf->dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
+	char *ptr = perf->buf;
+	int rem = sizeof(perf->buf);
+	int i, n;
+	if ((perf->cnt++ % 32) == 0) {
+		/* Header line: */
+		n = snprintf(ptr, rem, "%%BUSY");
+		ptr += n;
+		rem -= n;
+		for (i = 0; i < gpu->num_perfcntrs; i++) {
+			const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];
+			n = snprintf(ptr, rem, "\t%s", perfcntr->name);
+			ptr += n;
+			rem -= n;
+		}
+	} else {
+		/* Sample line: */
+		uint32_t activetime = 0, totaltime = 0;
+		uint32_t cntrs[5];
+		uint32_t val;
+		int ret;
+		/* sleep until next sample time: */
+		ret = wait_sample(perf);
+		if (ret)
+			return ret;
+		ret = msm_gpu_perfcntr_sample(gpu, &activetime, &totaltime,
+				ARRAY_SIZE(cntrs), cntrs);
+		if (ret < 0)
+			return ret;
+		val = totaltime ? 1000 * activetime / totaltime : 0;
+		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
+		ptr += n;
+		rem -= n;
+		for (i = 0; i < ret; i++) {
+			/* cycle counters (I think).. convert to MHz.. */
+			val = cntrs[i] / 10000;
+			n = snprintf(ptr, rem, "\t%5d.%02d",
+					val / 100, val % 100);
+			ptr += n;
+			rem -= n;
+		}
+	}
+	n = snprintf(ptr, rem, "\n");
+	ptr += n;
+	rem -= n;
+	perf->bufpos = 0;
+	perf->buftot = ptr - perf->buf;
+	return 0;
+}
+static ssize_t perf_read(struct file *file, char __user *buf,
+		size_t sz, loff_t *ppos)
+{
+	struct msm_perf_state *perf = file->private_data;
+	int n = 0, ret;
+	mutex_lock(&perf->read_lock);
+	if (perf->bufpos >= perf->buftot) {
+		ret = refill_buf(perf);
+		if (ret)
+			goto out;
+	}
+	n = min((int)sz, perf->buftot - perf->bufpos);
+	ret = copy_to_user(buf, &perf->buf[perf->bufpos], n);
+	if (ret)
+		goto out;
+	perf->bufpos += n;
+	*ppos += n;
+out:
+	mutex_unlock(&perf->read_lock);
+	if (ret)
+		return ret;
+	return n;
+}
+static int perf_open(struct inode *inode, struct file *file)
+{
+	struct msm_perf_state *perf = inode->i_private;
+	struct drm_device *dev = perf->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
+	int ret = 0;
+	mutex_lock(&dev->struct_mutex);
+	if (perf->open || !gpu) {
+		ret = -EBUSY;
+		goto out;
+	}
+	file->private_data = perf;
+	perf->open = true;
+	perf->cnt = 0;
+	perf->buftot = 0;
+	perf->bufpos = 0;
+	msm_gpu_perfcntr_start(gpu);
+	perf->next_jiffies = jiffies + SAMPLE_TIME;
+out:
+	mutex_unlock(&dev->struct_mutex);
+	return ret;
+}
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct msm_perf_state *perf = inode->i_private;
+	struct msm_drm_private *priv = perf->dev->dev_private;
+	msm_gpu_perfcntr_stop(priv->gpu);
+	perf->open = false;
+	return 0;
+}
+static const struct file_operations perf_debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = perf_open,
+	.read = perf_read,
+	.llseek = no_llseek,
+	.release = perf_release,
+};
+int msm_perf_debugfs_init(struct drm_minor *minor)
+{
+	struct msm_drm_private *priv = minor->dev->dev_private;
+	struct msm_perf_state *perf;
+	/* only create on first minor: */
+	if (priv->perf)
+		return 0;
+	perf = kzalloc(sizeof(*perf), GFP_KERNEL);
+	if (!perf)
+		return -ENOMEM;
+	perf->dev = minor->dev;
+	mutex_init(&perf->read_lock);
+	priv->perf = perf;
+	perf->node = kzalloc(sizeof(*perf->node), GFP_KERNEL);
+	if (!perf->node)
+		goto fail;
+	perf->ent = debugfs_create_file("perf", S_IFREG | S_IRUGO,
+			minor->debugfs_root, perf, &perf_debugfs_fops);
+	if (!perf->ent) {
+		DRM_ERROR("Cannot create /sys/kernel/debug/dri/%s/perf\n",
+				minor->debugfs_root->d_name.name);
+		goto fail;
+	}
+	perf->node->minor = minor;
+	perf->node->dent  = perf->ent;
+	perf->node->info_ent = NULL;
+	mutex_lock(&minor->debugfs_lock);
+	list_add(&perf->node->list, &minor->debugfs_list);
+	mutex_unlock(&minor->debugfs_lock);
+	return 0;
+fail:
+	msm_perf_debugfs_cleanup(minor);
+	return -1;
+}
+void msm_perf_debugfs_cleanup(struct drm_minor *minor)
+{
+	struct msm_drm_private *priv = minor->dev->dev_private;
+	struct msm_perf_state *perf = priv->perf;
+	if (!perf)
+		return;
+	priv->perf = NULL;
+	debugfs_remove(perf->ent);
+	if (perf->node) {
+		mutex_lock(&minor->debugfs_lock);
+		list_del(&perf->node->list);
+		mutex_unlock(&minor->debugfs_lock);
+		kfree(perf->node);
+	}
+	mutex_destroy(&perf->read_lock);
+	kfree(perf);
+}
+#endif