diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 811fe03a67f037eeb836717ac6f72e97dd47cd1f..d9e8921f0849c4acfebad1f5ff4c711a81c9796a 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -120,8 +120,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
     VLOG(3) << op->DebugStringEx(local_scope);
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto dev_ctx = const_cast<platform::DeviceContext*>(pool.Get(place_));
-    platform::RecordEvent record_event(op->Type(), dev_ctx);
+    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
 
     op->Run(*local_scope, place_);
     if (FLAGS_do_memory_benchmark) {
diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
index 8175b827c3e00769a89a519b909479a27a22b046..2a8afc940393baaaa939471f50f2d5c63edd6a84 100644
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() {
 }
 
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
-             DeviceContext* dev_ctx)
+             const DeviceContext* dev_ctx)
     : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
 #ifdef PADDLE_WITH_CUDA
-  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
-  if (cuda_dev_ctx) {
+  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
+  if (has_cuda_) {
+    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
     PADDLE_ENFORCE(cudaGetDevice(&device_));
     PADDLE_ENFORCE(cudaEventCreate(&event_));
     auto stream = cuda_dev_ctx->stream();
     PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-    has_cuda_ = true;
   }
 #endif
   cpu_ns_ = GetTimeInNsec();
@@ -114,19 +114,20 @@ inline EventList& GetEventList() {
   return *g_event_list;
 }
 
-void Mark(const std::string& name, DeviceContext* dev_ctx) {
+void Mark(const std::string& name, const DeviceContext* dev_ctx) {
   GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
 }
 
-void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
   GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
 }
 
-void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
   GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }
 
-RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+RecordEvent::RecordEvent(const std::string& name,
+                         const DeviceContext* dev_ctx) {
   if (g_state == ProfilerState::kDisabled) return;
   dev_ctx_ = dev_ctx;
   name_ = name;
@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) {
         DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
         Mark("_cuda_startup_", dev_ctx);
         dev_ctx->Wait();
+        delete dev_ctx;
       });
     }
   }
diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h
index 85823af1d7dfcc7f032878ebb9d63ea0b5f7a257..8de1e6ad296d1e15c1659ccf431f1d5013eb608c 100644
--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -29,7 +29,7 @@ class Event {
   // The DeviceContext is used to get the cuda stream.
   // If CPU profiling mode, can pass nullptr.
   Event(EventKind kind, std::string name, uint32_t thread_id,
-        DeviceContext* dev_ctx);
+        const DeviceContext* dev_ctx);
 
   std::string kind() const;
   std::string name() const { return name_; }
@@ -95,19 +95,19 @@ enum ProfilerState {
   kCUDA,      // GPU profiling state
 };
 
-void Mark(const std::string& name, DeviceContext* dev_ctx);
+void Mark(const std::string& name, const DeviceContext* dev_ctx);
 
-void PushEvent(const std::string& name, DeviceContext* dev_ctx);
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
 
-void PopEvent(const std::string& name, DeviceContext* dev_ctx);
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
 
 struct RecordEvent {
-  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
+  explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
 
   ~RecordEvent();
 
   // The device context is used by Event to get the current cuda stream.
-  DeviceContext* dev_ctx_;
+  const DeviceContext* dev_ctx_;
   // Event name
   std::string name_;
 };
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
index a5f0f189ab623837d3d755a8474a54c6bc4e0855..51c1c8aa705513825b46fb936c6c99090c50fb7d 100644
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -81,10 +81,11 @@ def profiler(state, sorted_key=None):
     to add more records.
 
     Args:
-        state (string) : The profiling state, It should be 'CPU' or 'GPU'.
-            Although users may define CPUPlace or CUDAPlace when using Fluid,
-            the profiler doesn't get the state based on this Place. Since the
-            implementation is an independent part from the Fluid.
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) at the beginning, for flexibility the profiler
+            does not inherit this place.
         sorted_key (string) : If None, the profiling results will be printed
             in the order of first end time of events. Otherwise, the profiling
             results will be sorted by the this flag. This flag should be one
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
index dfee4e2722ceaf9422bbbcb9e319985f1574c6a3..34700df37d22cf71bad2d86efa4718a3767c2d4f 100644
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -41,8 +41,8 @@ class TestProfiler(unittest.TestCase):
                 exe.run(fluid.default_main_program(), feed={'data': input})
         os.remove(output_file)
 
-    def profiler(self, state):
-        if state == 'GPU' and core.is_compile_gpu():
+    def net_profiler(self, state):
+        if state == 'GPU' and not core.is_compile_gpu():
             return
         startup_program = fluid.Program()
         main_program = fluid.Program()
@@ -79,11 +79,11 @@ class TestProfiler(unittest.TestCase):
                 acc = np.array(outs[1])
                 pass_acc = accuracy.eval(exe)
 
-    def not_test_cpu_profiler(self):
-        self.profiler('CPU')
+    def test_cpu_profiler(self):
+        self.net_profiler('CPU')
 
-    def not_test_cuda_profiler(self):
-        self.profiler('GPU')
+    def test_cuda_profiler(self):
+        self.net_profiler('GPU')
 
 
 if __name__ == '__main__':