diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3a6e597271f9c6e4b42f0da79f6452f02e76ddc
--- /dev/null
+++ b/paddle/platform/cuda_profiler.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cuda_profiler_api.h>
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+// Counters and options written into the configuration file that is
+// consumed by the CUDA command-line profiler.
+static std::vector<std::string> kCudaProfileConfiguration = {
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "dynsmemperblock",
+    "stasmemperblock",
+    "regperthread",
+    "memtransfersize",
+    "memtransferdir",
+    "memtransferhostmemtype",
+    "streamid",
+    "cacheconfigrequested",
+    "cacheconfigexecuted",
+    "countermodeaggregate",
+    "enableonstart 0",
+    "active_warps",
+    "active_cycles",
+};
+
+void CudaProfilerInit(std::string output_file, std::string output_mode) {
+  // Write the configuration above into a unique temporary file.
+  std::array<char, 128> buf;
+  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
+  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
+  memcpy(buf.data(), tmpl.data(), tmpl.size() + 1);  // also copy the '\0'
+  auto result = mktemp(buf.data());
+  PADDLE_ENFORCE(strlen(result) != 0);
+  std::string config = result;
+
+  {
+    std::ofstream ofs(config, std::ios::out | std::ios::trunc);
+    PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
+    for (const auto& line : kCudaProfileConfiguration) {
+      ofs << line << std::endl;
+    }
+  }
+
+  PADDLE_ENFORCE(output_mode == "key_value" || output_mode == "csv");
+  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
+  PADDLE_ENFORCE(
+      cudaProfilerInitialize(config.c_str(), output_file.c_str(), mode));
+}
+
+void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
+
+void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index f55a1edce31ccf2498dcfcf0b30ba1012d7a7d1a..c16d3e0cbe01f90a5aa9a5d7a523cd4e282e4771 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -37,6 +37,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/cuda_profiler.h"
 #include "paddle/platform/gpu_info.h"
 #endif
 
@@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
+
+  m.def("nvprof_init", platform::CudaProfilerInit);
+  m.def("nvprof_start", platform::CudaProfilerStart);
+  m.def("nvprof_stop", platform::CudaProfilerStop);
 #endif
 
   return m.ptr();
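With the bindings registered above, the profiler can also be driven directly from Python through paddle.v2.fluid.core, independently of the wrapper module added below. A minimal sketch, assuming Paddle was built with PADDLE_WITH_CUDA so the three nvprof_* symbols exist; the output file name is a placeholder:

    import paddle.v2.fluid.core as core

    # Writes the counter configuration to a temporary file and initializes
    # the CUDA command-line profiler; the mode must be 'key_value' or 'csv'.
    core.nvprof_init("cuda_profiler_raw.txt", "csv")

    core.nvprof_start()
    # ... launch the GPU work to be profiled here ...
    core.nvprof_stop()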
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94ef67b48ec00a329e875fc671759fda6d925f6
--- /dev/null
+++ b/python/paddle/v2/fluid/profiler.py
@@ -0,0 +1,59 @@
+import paddle.v2.fluid.core as core
+
+
+def nvprof_init(output_file, output_mode=None):
+    """
+    Initializes the CUDA profiler.
+    This method must be called before nvprof_start.
+
+    :param output_file: The output file name.
+    :type output_file: string
+    :param output_mode: The output format, either key-value pairs or
+                        comma separated values. It should be 'key_value'
+                        or 'csv'. Defaults to 'csv'.
+    :type output_mode: string
+    """
+    if output_mode is None:
+        output_mode = 'csv'
+    if output_mode not in ['key_value', 'csv']:
+        raise ValueError("The output mode must be 'key_value' or 'csv'.")
+    core.nvprof_init(output_file, output_mode)
+
+
+def nvprof_start():
+    """
+    Enables profile collection by the active CUDA profiling tool.
+    """
+    core.nvprof_start()
+
+
+def nvprof_stop():
+    """
+    Disables profile collection.
+    """
+    core.nvprof_stop()
+
+
+class profiler(object):
+    def __init__(self, output_file, output_mode=None, enabled=True):
+        self.enabled = enabled
+        if not self.enabled:
+            return
+        self.entered = False
+        nvprof_init(output_file, output_mode)
+
+    def __enter__(self):
+        if not self.enabled:
+            return
+        if self.entered:
+            raise RuntimeError("The profiler traces are not reentrant")
+        self.entered = True
+        nvprof_start()
+        return self
+
+    def __exit__(self, exc_type, exc_value, tb):
+        if not self.enabled:
+            return False
+        nvprof_stop()
+        # Returning False lets any exception raised in the block propagate.
+        return False
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7da7a28cf6e5cee0f5633e31703a9833963cade1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -0,0 +1,17 @@
+import numpy as np
+
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.profiler as profiler
+from paddle.v2.fluid.executor import Executor
+
+place = core.GPUPlace(0)
+exe = Executor(place)
+
+epoc = 8
+dshape = [4, 3, 28, 28]
+data = layers.data(name='data', shape=dshape, dtype='float32')
+conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+# Initialize the conv2d parameters before profiling the main program.
+exe.run(framework.default_startup_program(), feed={}, fetch_list=[])
+
+input = core.LoDTensor()
+with profiler.profiler("cuda_profiler.txt") as nvprof:
+    for i in range(epoc):
+        input.set(np.random.random(dshape).astype("float32"), place)
+        exe.run(framework.default_main_program(),
+                feed={'data': input},
+                fetch_list=[conv])
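Beyond the test above, the enabled flag on the context manager lets the same script run with profiling switched off. A small sketch under the same assumptions as the test (GPU build, GPUPlace(0) available); the output file name and the use_profiler switch are illustrative only:

    import numpy as np
    import paddle.v2.fluid.core as core
    import paddle.v2.fluid.profiler as profiler

    use_profiler = True  # flip to False to make the block below a no-op

    place = core.GPUPlace(0)
    tensor = core.LoDTensor()

    with profiler.profiler("conv_profile.txt",
                           output_mode='key_value',
                           enabled=use_profiler):
        # Any CUDA activity inside this block is recorded when profiling is
        # enabled; a host-to-device copy stands in for a real workload here.
        tensor.set(np.random.random([4, 3, 28, 28]).astype("float32"), place)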