diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h
index d3a6e597271f9c6e4b42f0da79f6452f02e76ddc..c096ce37c56d5d6c34d543dcd6889a560e44286c 100644
--- a/paddle/platform/cuda_profiler.h
+++ b/paddle/platform/cuda_profiler.h
@@ -14,33 +14,15 @@ limitations under the License. */
 
 #pragma once
 #include <cuda_profiler_api.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 namespace paddle {
 namespace platform {
 
-static std::vector<std::string> kCudaProfileConfiguration = {
-    "gpustarttimestamp",
-    "gpuendtimestamp",
-    "gridsize3d",
-    "threadblocksize",
-    "dynsmemperblock",
-    "stasmemperblock",
-    "regperthread",
-    "memtransfersize",
-    "memtransferdir",
-    "memtransferhostmemtype",
-    "streamid",
-    "cacheconfigrequested",
-    "cacheconfigexecuted",
-    "countermodeaggregate",
-    "enableonstart 0",
-    "active_warps",
-    "active_cycles",
-};
-
-void CudaProfilerInit(std::string output_file, std::string output_mode) {
+void CudaProfilerInit(std::string output_file, std::string output_mode,
+                      std::vector<std::string> config_flags) {
   std::array<char, 128> buf;
   std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
   PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
@@ -52,12 +34,12 @@ void CudaProfilerInit(std::string output_file, std::string output_mode) {
   {
     std::ofstream ofs(config, std::ios::out | std::ios::trunc);
     PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
-    for (const auto& line : kCudaProfileConfiguration) {
+    for (const auto& line : config_flags) {
       ofs << line << std::endl;
     }
   }
 
-  PADDLE_ENFORCE(output_mode == "key_value" || output_mode == "csv");
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
   cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
   PADDLE_ENFORCE(
       cudaProfilerInitialize(config.c_str(), output_file.c_str(), mode));
@@ -66,5 +48,10 @@ void CudaProfilerInit(std::string output_file, std::string output_mode) {
-void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
+// Calls cudaProfilerStart() and enforces that it succeeds.
+// `inline` because this is defined in a header: a second translation unit
+// including it would otherwise emit a duplicate definition at link time.
+inline void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
 
-void CudaProfilerStop() { PADDLE_ENFORCE((cudaProfilerStop())); }
-}
-}
+// Calls cudaProfilerStop() and enforces that it succeeds.
+inline void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
index b94ef67b48ec00a329e875fc671759fda6d925f6..f31d6f0a617c42601c164603692d59f8d722c48b 100644
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -1,7 +1,20 @@
 import paddle.v2.fluid.core as core
+import subprocess
 
+__all__ = ['CudaProfiler']
 
-def nvporf_init(output_file, output_mode=None):
+NV_FLAGS = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+
+
+def nvporf_init(output_file, output_mode=None, flags=None):
     """
     Initialize the CUDA profiler.
-    This methods must be called before nvprof_start.
+    This method must be called before nvprof_start.
@@ -10,14 +23,15 @@ def nvporf_init(output_file, output_mode=None):
     :type output_file: string
     :param output_mode: The output mode has Key-Value pair format and
                         Comma separated values format.
-                        It should be 'key-value' or 'csv'.
+                        It should be 'kvp' or 'csv'.
     :type output_mode: string
     """
     if output_mode is None:
         output_mode = 'csv'
-    if output_mode != 'key-value' or output_mode != 'csv':
-        raise ValueError("The output mode must be 'key-value' or 'csv'.")
-    core.nvprof_init(output_file, output_mode)
+    if output_mode not in ['kvp', 'csv']:
+        raise ValueError("The output mode must be 'kvp' or 'csv'.")
+    flags = NV_FLAGS if flags is None else flags
+    core.nvprof_init(output_file, output_mode, flags)
 
 
 def nvporf_start():
@@ -34,13 +48,14 @@ def nvporf_stop():
     core.nvprof_stop()
 
 
-class profiler(object):
-    def __init__(self, output_file, output_mode=None, enabled=True):
+class CudaProfiler(object):
+    def __init__(self, output_file, output_mode=None, flags=None, enabled=True):
         self.enabled = enabled
         if not self.enabled:
             return
         self.entered = False
-        nvporf_init(output_file, output_mode)
+        self.out_file = output_file
+        nvporf_init(output_file, output_mode, flags)
 
     def __enter__(self):
         if not self.enabled:
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
index 7da7a28cf6e5cee0f5633e31703a9833963cade1..1fec5c99bf76a7706a1ae529b4d12aa0dad4da57 100644
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -1,17 +1,28 @@
+import unittest
+import numpy as np
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.profiler as profiler
 import paddle.v2.fluid.layers as layers
-import numpy as np
 
-place = core.GPUPlace(0)
-exe = Executor(place)
 
-epoc = 8
-dshape = [4, 3, 28, 28]
-data = layers.data(name='data', shape=dshape, dtype='float32')
-conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+class TestProfiler(unittest.TestCase):
+    def test_nvprof(self):
+        """Run a conv2d program several times inside a CudaProfiler block."""
+        if not fluid.core.is_compile_gpu():
+            return
+        num_iters = 8
+        batch_shape = [4, 3, 28, 28]
+        image = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        layers.conv2d(image, 20, 3, stride=[1, 1], padding=[1, 1])
+
+        executor = fluid.Executor(fluid.GPUPlace(0))
+        executor.run(fluid.default_startup_program())
+
+        with profiler.CudaProfiler("cuda_profiler.txt", 'csv'):
+            for _ in range(num_iters):
+                batch = np.random.random(batch_shape).astype("float32")
+                executor.run(fluid.default_main_program(), feed={'data': batch})
+
 
-input = core.LoDTensor()
-with profiler("cuda_profiler.txt") as nvprof:
-    for i in range(epoc):
-        input.set(np.random.random(dshape).astype("float32"), place)
-        exe.run(framework.default_main_program(), feed={'data': data})
+if __name__ == '__main__':
+    unittest.main()