Add CUDA profiler tools and expose them in Python.

623f62a7 · dangqingqing · 322d69f2 · 623f62a7 · 623f62a7 · 623f62a7
3 changed files
--- a/paddle/platform/cuda_profiler.h
+++ b/paddle/platform/cuda_profiler.h
@@ -14,33 +14,15 @@ limitations under the License. */

 #pragma once
 #include <cuda_profiler_api.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 namespace paddle {
 namespace platform {

-static std::vector<std::string> kCudaProfileConfiguration = {
-    "gpustarttimestamp",
-    "gpuendtimestamp",
-    "gridsize3d",
-    "threadblocksize",
-    "dynsmemperblock",
-    "stasmemperblock",
-    "regperthread",
-    "memtransfersize",
-    "memtransferdir",
-    "memtransferhostmemtype",
-    "streamid",
-    "cacheconfigrequested",
-    "cacheconfigexecuted",
-    "countermodeaggregate",
-    "enableonstart 0",
-    "active_warps",
-    "active_cycles",
-};
-
-void CudaProfilerInit(std::string output_file, std::string output_mode) {
+void CudaProfilerInit(std::string output_file, std::string output_mode,
+                      std::vector<std::string> config_flags) {
  std::array<char, 128> buf;
  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
@@ -52,12 +34,12 @@ void CudaProfilerInit(std::string output_file, std::string output_mode) {
  {
    std::ofstream ofs(config, std::ios::out | std::ios::trunc);
    PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
-    for (const auto& line : kCudaProfileConfiguration) {
+    for (const auto& line : config_flags) {
      ofs << line << std::endl;
    }
  }

-  PADDLE_ENFORCE(output_mode == "key_value" || output_mode == "csv");
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
  PADDLE_ENFORCE(
      cudaProfilerInitialize(config.c_str(), output_file.c_str(), mode));
@@ -66,5 +48,6 @@ void CudaProfilerInit(std::string output_file, std::string output_mode) {
 void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }

 void CudaProfilerStop() { PADDLE_ENFORCE((cudaProfilerStop())); }
-}
-}
+
+}  // namespace platform
+}  // namespace paddle
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
 import paddle.v2.fluid.core as core
+import subprocess

+__all__ = ['CudaProfiler']

-def nvporf_init(output_file, output_mode=None):
+NV_FLAGS = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+
+
+def nvporf_init(output_file, output_mode=None, flags=None):
    """
    Initialize the CUDA profiler.
    This method must be called before nvprof_start.
@@ -10,14 +23,15 @@ def nvporf_init(output_file, output_mode=None):
    :type output_file: string
    :param output_mode: The output mode has Key-Value pair format and
                        Comma separated values format.
-                        It should be 'key-value' or 'csv'.
+                        It should be 'kv' or 'csv'.
    :type output_mode: string
    """
    if output_mode is None:
        output_mode = 'csv'
-    if output_mode != 'key-value' or output_mode != 'csv':
+    if output_mode not in ['kv', 'csv']:
        raise ValueError("The output mode must be 'key-value' or 'csv'.")
-    core.nvprof_init(output_file, output_mode)
+    flags = NV_FLAGS if flags is None else flags
+    core.nvprof_init(output_file, output_mode, flags)


 def nvporf_start():
@@ -34,13 +48,14 @@ def nvporf_stop():
    core.nvprof_stop()


-class profiler(object):
-    def __init__(self, output_file, output_mode=None, enabled=True):
+class CudaProfiler(object):
+    def __init__(self, output_file, output_mode=None, flags=None, enabled=True):
        self.enabled = enabled
        if not self.enabled:
            return
        self.entered = False
-        nvporf_init(output_file, output_mode)
+        self.out_file = output_file
+        nvporf_init(output_file, output_mode, flags)

    def __enter__(self):
        if not self.enabled:

--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
+import unittest
+import numpy as np
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.profiler as profiler
 import paddle.v2.fluid.layers as layers
-import numpy as np

-place = core.GPUPlace(0)
-exe = Executor(place)

-epoc = 8
-dshape = [4, 3, 28, 28]
-data = layers.data(name='data', shape=dshape, dtype='float32')
-conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+class TestProfiler(unittest.TestCase):
+    def test_nvprof(self):
+        if not fluid.core.is_compile_gpu():
+            return
+        epoc = 8
+        dshape = [4, 3, 28, 28]
+        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+        place = fluid.GPUPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        with profiler.CudaProfiler("cuda_profiler.txt", 'csv') as nvprof:
+            for i in range(epoc):
+                input = np.random.random(dshape).astype("float32")
+                exe.run(fluid.default_main_program(), feed={'data': input})
+

-input = core.LoDTensor()
-with profiler("cuda_profiler.txt") as nvprof:
-    for i in range(epoc):
-        input.set(np.random.random(dshape).astype("float32"), place)
-        exe.run(framework.default_main_program(), feed={'data': data})
+if __name__ == '__main__':
+    unittest.main()