Commit d8a939d8 authored by Liu Yiqun

Merge branch 'develop' into core_opt_choose_kernel

This diff is collapsed.
......@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......
......@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -55,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
std::vector<FetchOpHandle *> fetch_ops;
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
......
......@@ -14,8 +14,11 @@
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
......@@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->CPU");
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CPU->GPU");
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->GPU");
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
......@@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
......
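The hunks above wrap each branch of `TensorCopySync` in a named `platform::RecordEvent` so the profiler can attribute copy time to a specific transfer direction (GPU->CPU, CPU->GPU, and so on). The scoped usage suggests an RAII guard that opens the range at construction and closes it at destruction; the following self-contained sketch imitates that behaviour with a plain timer (it is not Paddle code, and `ScopedRecord` is a stand-in name):

```cpp
// Sketch only: a stand-in for the RAII profiling scope used in the diff above.
#include <chrono>
#include <cstdio>
#include <string>

struct ScopedRecord {
  std::string name;
  std::chrono::steady_clock::time_point begin{std::chrono::steady_clock::now()};
  explicit ScopedRecord(std::string n) : name(std::move(n)) {}
  ~ScopedRecord() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - begin)
                  .count();
    std::printf("%s took %lld us\n", name.c_str(), static_cast<long long>(us));
  }
};

int main() {
  {
    ScopedRecord record_event("TensorCopy:CPU->GPU");  // mirrors the diff's usage
    // ... the memory::Copy call would sit here ...
  }  // the range closes when record_event goes out of scope
  return 0;
}
```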
add_subdirectory(detail)
add_subdirectory(allocation)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler)
cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memory
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace memory {
......@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
// NOTE(zcd): Avoid using GpuMemcpySync whenever possible,
// because GpuMemcpySync issues the copy command to the default stream,
// which prevents commands from different streams from running concurrently.
// Reference:
// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
......@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
......@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
}
} else {
if (stream) {
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream);
} else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num);
}
......@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
}
......@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
}
......
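The NOTE above (and the per-branch `RecordEvent` labels) rest on the fact that work issued to the legacy default stream serializes with ordinary streams, while a copy queued on a user-created stream can overlap with other work. A self-contained CUDA sketch of that distinction follows; it is illustrative only, not Paddle code, and the buffer size is arbitrary:

```cpp
// Sketch: contrast a synchronous copy on the default stream with an async
// copy on a user-created stream, as the NOTE in memcpy.cc describes.
#include <cuda_runtime.h>

int main() {
  const size_t bytes = 1 << 20;
  float *pinned_host = nullptr, *device_buf = nullptr;
  cudaMallocHost(&pinned_host, bytes);  // page-locked, so async copies can truly overlap
  cudaMalloc(&device_buf, bytes);

  cudaStream_t stream;
  cudaStreamCreate(&stream);  // an ordinary (blocking) stream

  // Queued on `stream`; returns immediately and can overlap with other work.
  cudaMemcpyAsync(device_buf, pinned_host, bytes, cudaMemcpyHostToDevice, stream);

  // Issued on the legacy default stream: it serializes with ordinary streams
  // such as `stream` and blocks the host until it completes -- the behaviour
  // the NOTE warns about for GpuMemcpySync.
  cudaMemcpy(device_buf, pinned_host, bytes, cudaMemcpyHostToDevice);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaFree(device_buf);
  cudaFreeHost(pinned_host);
  return 0;
}
```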
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm>
#include <functional>
#include <memory>
#include <vector>
#include "ngraph/ngraph.hpp"
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
......
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
namespace reader {
......@@ -49,9 +51,10 @@ BufferedReader::BufferedReader(
.Get(place_)))
->stream();
events.resize(buffer_size);
for (auto &event : events)
PADDLE_ENFORCE(cudaStreamCreate(&stream));
for (auto &event : events) {
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
}
}
#endif
cpu_buffer_.resize(buffer_size);
......@@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream
// TensorCopySync would block other streams, because TensorCopySync
// issues the copy command to the default stream, which prevents
// commands from different streams from running concurrently.
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) {
gpu[i].Resize(cpu[i].dims());
gpu[i].set_layout(cpu[i].layout());
......@@ -97,20 +103,19 @@ void BufferedReader::ReadAsync(size_t i) {
auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
if (platform::is_cuda_pinned_place(cpu_place))
if (platform::is_cuda_pinned_place(cpu_place)) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPinnedPlace>(cpu_place),
cpu_ptr, size, stream);
else if ((platform::is_gpu_place(cpu_place)))
} else if ((platform::is_gpu_place(cpu_place))) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
size, stream);
else
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
} else {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
0);
stream);
}
gpu[i].set_lod(cpu[i].lod());
}
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
......
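The BufferedReader change above creates one non-blocking copy stream plus one lightweight event per buffer slot, makes the copy stream wait on the slot's event before refilling it, and routes even the plain CPU->GPU fallback copy through `stream` instead of stream 0. The sketch below shows that wait-then-copy handshake in isolation (Paddle types and buffer bookkeeping omitted; where the event is recorded is simplified here):

```cpp
// Sketch: the event/stream pattern used by BufferedReader in the hunk above.
#include <cuda_runtime.h>

int main() {
  const size_t bytes = 1 << 20;
  float *pinned_host = nullptr, *device_buf = nullptr;
  cudaMallocHost(&pinned_host, bytes);
  cudaMalloc(&device_buf, bytes);

  // One dedicated, non-blocking copy stream (no implicit sync with the
  // default stream) and one timing-disabled event per buffer slot.
  cudaStream_t copy_stream;
  cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking);
  cudaEvent_t slot_event;
  cudaEventCreateWithFlags(&slot_event, cudaEventDisableTiming);

  // Mark the point after which the slot's previous consumer has finished ...
  cudaEventRecord(slot_event, /*stream=*/0);
  // ... and have the copy stream wait for it before reusing the slot.
  cudaStreamWaitEvent(copy_stream, slot_event, 0);

  cudaMemcpyAsync(device_buf, pinned_host, bytes, cudaMemcpyHostToDevice,
                  copy_stream);
  cudaStreamSynchronize(copy_stream);  // BufferedReader synchronizes per slot

  cudaEventDestroy(slot_event);
  cudaStreamDestroy(copy_stream);
  cudaFree(device_buf);
  cudaFreeHost(pinned_host);
  return 0;
}
```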
......@@ -30,7 +30,6 @@ limitations under the License. */
#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
......@@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
}
case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
// -1 device id represents CUDA api call
tracer->AddCPURecords(
if (api->start != 0 && api->end != 0) {
// -1 device id represents ActiveKind api call
tracer->AddActiveKindRecords(
DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
GetThreadIdFromSystemThreadId(api->threadId),
api->correlationId);
}
break;
}
case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
tracer->AddCPURecords(
if (api->start != 0 && api->end != 0) {
// -1 device id represents ActiveKind api call
tracer->AddActiveKindRecords(
RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
GetThreadIdFromSystemThreadId(api->threadId),
api->correlationId);
}
break;
}
default: { break; }
......@@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes});
}
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
}
thread_local std::forward_list<ActiveKindRecord>
*local_active_kind_records = nullptr;
if (local_active_kind_records == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
active_kind_records_.emplace_front();
local_active_kind_records = &active_kind_records_.front();
}
// No lock is needed; only one thread calls this function.
local_active_kind_records->push_front(ActiveKindRecord{
anno, start_ns, end_ns, device_id, thread_id, correlation_id});
}
void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t device_id, int64_t stream_id,
uint32_t correlation_id) {
......@@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer {
}
const std::vector<int> cbids {
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
......@@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear();
for (auto &tmp : active_kind_records_) tmp.clear();
}
void GenEventKernelCudaElapsedTime() {
......@@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id);
}
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_)
for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
......@@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
}
for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
} else {
event->set_name(r.name);
}
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
}
miss = find = 0;
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
......@@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer {
std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs;
std::unordered_map<uint32_t, Event *> correlations_;
......@@ -613,6 +657,7 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
#if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
......
......@@ -63,7 +63,14 @@ class DeviceTracer {
uint32_t correlation_id;
uint64_t bytes;
};
struct ActiveKindRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
int64_t device_id;
int64_t thread_id;
uint32_t correlation_id;
};
virtual ~DeviceTracer() {}
// Needs to be called once before use.
virtual void Enable() = 0;
......@@ -85,6 +92,10 @@ class DeviceTracer {
virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id,
uint32_t correlation_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
......
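The new `ActiveKindRecord` carries CUPTI's `correlationId`, which is what lets the tracer join a driver/runtime API interval (e.g. a `cudaLaunch` call) with the operator annotation that triggered it, as the `GenProfile` hunk above does via `correlations_`. Below is a stripped-down sketch of that join, with plain strings standing in for the profiler's `Event` objects and made-up record values:

```cpp
// Sketch: joining API-call records to annotations by CUPTI correlation id.
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

struct ActiveKindRecord {  // mirrors the struct added in the header above
  std::string name;        // e.g. "cudaMemcpyAsync"
  uint64_t start_ns, end_ns;
  int64_t device_id, thread_id;
  uint32_t correlation_id;
};

int main() {
  // Annotation captured when the API call was intercepted (stand-in for the
  // tracer's correlations_ map of Event pointers).
  std::unordered_map<uint32_t, std::string> correlations = {
      {42, "conv2d/GpuMemcpyAsync:CPU->GPU"}};

  std::vector<ActiveKindRecord> records = {
      {"cudaMemcpyAsync", 100, 250, /*device_id=*/-1, /*thread_id=*/7, 42}};

  for (const auto &r : records) {
    auto c = correlations.find(r.correlation_id);
    // Prefer the human-readable annotation; keep the raw API name as detail.
    const std::string &name = (c != correlations.end()) ? c->second : r.name;
    std::printf("event %s [%llu, %llu] detail=%s\n", name.c_str(),
                static_cast<unsigned long long>(r.start_ns),
                static_cast<unsigned long long>(r.end_ns), r.name.c_str());
  }
  return 0;
}
```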
......@@ -415,10 +415,11 @@ function assert_api_not_changed() {
source .env/bin/activate
pip install ${PADDLE_ROOT}/build/python/dist/*whl
python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
# Use sed to keep the python2 and python3 spec the same
sed -i 's/arg0: str/arg0: unicode/g' new.spec
sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
fi
# ComposeNotAligned has significant difference between py2 and py3
sed -i '/.*ComposeNotAligned.*/d' new.spec
......@@ -452,11 +453,20 @@ function assert_api_spec_approvals() {
echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308`
else
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
fi
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}"
else
echo "You must have panyx0718 approval for the api change! ${API_FILE}"
fi
exit 1
fi
fi
......@@ -472,19 +482,6 @@ function assert_api_spec_approvals() {
exit 1
fi
fi
pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl
CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py`
if [ "True" != ${CHECK_DOCK_MD5} ]; then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have shanyi15 approval for the api doc change! "
exit 1
fi
echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt
fi
}
......
......@@ -17,7 +17,6 @@ import os
import six
import sys
from .. import compat as cpt
from . import framework
from . import core
from . import framework
......@@ -36,6 +35,30 @@ def _place_obj(place):
return p
def _is_pserver_mode(main_program):
main = main_program if main_program \
else default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
def get_available_places(use_cuda):
if use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert places, "no place for execution"
return places
class CompiledProgram(object):
"""
Compiles to Graph for execution.
......@@ -127,8 +150,7 @@ class CompiledProgram(object):
self._exec_strategy = ExecutionStrategy()
if self._build_strategy is None:
self._build_strategy = BuildStrategy()
self._build_strategy.is_distribution = framework.is_pserver_mode(
self._program)
self._build_strategy.is_distribution = _is_pserver_mode(self._program)
return self
def with_inference_optimize(self, config):
......@@ -153,9 +175,9 @@ class CompiledProgram(object):
def _with_distributed(self):
raise NotImplementedError()
def _compile_data_parallel(self):
def _compile_data_parallel(self, use_cuda=False, scope=None):
if self._share_vars_from:
if self._scope:
if scope:
sys.stderr.write("share_vars_from is set, scope is ignored.\n")
if not self._share_vars_from._is_data_parallel:
raise ValueError("share_vars_from is not data parallel. Cannot "
......@@ -166,23 +188,11 @@ class CompiledProgram(object):
"var to share.")
self._local_scopes = self._share_vars_from._executor.local_scopes()
else:
assert scope is not None, ""
self._local_scopes = []
self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
if self._exec_strategy.use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [
i for i in six.moves.range(core.get_cuda_device_count())
]
self._places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert self._places, "no place for execution"
self._exec_strategy.use_cuda = use_cuda
self._places = get_available_places(self._exec_strategy.use_cuda)
if self._exec_strategy.num_threads == 0:
if self._exec_strategy.use_cuda:
......@@ -197,9 +207,11 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if self._build_strategy.memory_optimize is None:
self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True
self._build_strategy.memory_optimize = False \
if self._program and self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True
self._build_strategy.enable_inplace = False \
if self._program and self._program._is_mem_optimized else True
# TODO(wuyi): trainer endpoings should be passed in through
# build_strategy, not program.xxx.
......@@ -221,12 +233,12 @@ class CompiledProgram(object):
places = list(map(_place_obj, self._places))
return core.ParallelExecutor(
places,
return core.ParallelExecutor(places,
set(self._persistable_vars),
cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), self._scope, self._local_scopes,
self._exec_strategy, self._build_strategy, self._graph)
if self._loss_name else six.u(''), scope,
self._local_scopes, self._exec_strategy,
self._build_strategy, self._graph)
def _compile_inference(self):
return core.create_paddle_predictor(self._infer_config)
......@@ -253,7 +265,9 @@ class CompiledProgram(object):
self._scope = scope
self._place = place
if self._is_data_parallel:
self._executor = self._compile_data_parallel()
self._executor = self._compile_data_parallel(
use_cuda=isinstance(self._place, core.CUDAPlace),
scope=self._scope)
elif self._is_inference:
self._executor = self._compile_inference()
else:
......
......@@ -261,20 +261,20 @@ def _as_lodtensor(data, place):
class Executor(object):
"""
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
ParallelExecutor.
Python executor takes a program, add feed operators and fetch operators to this program according
An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
Python executor takes a program, adds feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all
the variables(or names) that user wants to get after program runs. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The contents in local scope may be discarded after every minibatch forward/backward
finished. But the global scope variables will be persistent through different runs.
Example:
.. code-block:: python
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
......@@ -297,9 +297,6 @@ class Executor(object):
Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
"""
def __init__(self, place):
......@@ -382,6 +379,12 @@ class Executor(object):
]
return outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def close(self):
"""
Close this executor.
......@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to
the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example:
>>> cpu = core.CPUPlace()
......
......@@ -87,15 +87,6 @@ def _current_expected_place():
return _imperative_current_expected_place_
def is_pserver_mode(main_program):
main = main_program if main_program \
else default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
class NameScope(object):
def __init__(self, name="", parent=None):
self._children = dict()
......
......@@ -468,9 +468,10 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
# `prog` can be a program defined by the user
prog = fluid.default_main_program()
fluid.io.save_persistables(executor=exe, dirname=param_path,
main_program=None)
main_program=prog)
"""
if main_program and main_program._is_distributed:
......
......@@ -13,15 +13,11 @@
# limitations under the License.
from __future__ import print_function
import multiprocessing
from . import core
from . import framework
from . import executor
from .. import compat as cpt
import warnings
from . import compiler
import sys
import six
import os
__all__ = ['ParallelExecutor']
......@@ -97,99 +93,27 @@ class ParallelExecutor(object):
'Please use CompiledProgram and Executor. CompiledProgram '
'is a central place for optimization and Executor is the '
'unified executor. Example can be found in compiler.py.\n')
# step1: get places, the places are used in run too.
self._places = []
if use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [
i for i in six.moves.range(core.get_cuda_device_count())
]
self._places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert self._places, "no place for execution"
# step2: init exec_strategy
if exec_strategy is None:
exec_strategy = ExecutionStrategy()
exec_strategy.use_cuda = use_cuda
if exec_strategy.num_threads == 0:
if use_cuda:
# Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future.
exec_strategy.num_threads = len(self._places) * 4
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num * 2
# step3: init build_strategy
if build_strategy is None:
build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy.is_distribution = framework.is_pserver_mode(
main_program) or num_trainers > 1
# step4: get main_program, scope, local_scopes
main = main_program if main_program \
else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if build_strategy.memory_optimize is None:
build_strategy.memory_optimize = False if main._is_mem_optimized else True
if build_strategy.enable_inplace is None:
build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes()\
if share_vars_from else []
# step5: check trainers_endpoints, it is used for distribution.
trainers_endpoints = main._trainers_endpoints
if num_trainers > 1 and trainers_endpoints:
assert num_trainers == len(
trainers_endpoints), "num_trainers == len(endpoints)"
build_strategy.trainers_endpoints = trainers_endpoints
# step6: get persistable_vars, places. persistable_vars
# need be broadcast to other local_scope.
persistable_vars = set([
cpt.to_text(v.name) for v in [
var for var in main.list_vars()
if var.persistable and var.type != core.VarDesc.VarType.RAW
]
])
def place_obj(place):
p = core.Place()
p.set_place(place)
return p
places = list(map(place_obj, self._places))
# step7: init ParallelExecutor
# ParallelExecutor API will be deprecated, don't support parallel graph.
self._graph = core.Graph(main.desc)
self._places = compiler.get_available_places(use_cuda)
self._scope = scope if scope is not None else executor.global_scope()
self.executor = core.ParallelExecutor(
places, persistable_vars,
cpt.to_text(loss_name) if loss_name else six.u(''), scope,
local_scopes, exec_strategy, build_strategy, self._graph)
main_program = main_program if main_program is not None \
else framework.default_main_program()
self.scope = scope
self._compiled_program = compiler.CompiledProgram(main_program)
self._compiled_program.with_data_parallel(
loss_name=loss_name,
build_strategy=build_strategy,
exec_strategy=exec_strategy,
share_vars_from=share_vars_from)
self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
self._executor = executor.Executor(self._place)
self._compiled_program._compile(place=self._place, scope=self._scope)
def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
"""
......@@ -256,56 +180,11 @@ class ParallelExecutor(object):
loss = pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))
"""
if feed is None and feed_dict is not None:
feed = feed_dict
print(
"`feed_dict` is deprecated. Please use `feed=`",
file=sys.stderr)
if isinstance(feed, dict):
feed_tensor_dict = dict()
for feed_name in feed:
feed_tensor = feed[feed_name]
if not isinstance(feed_tensor, core.LoDTensor):
feed_tensor = core.LoDTensor()
# always set to CPU place, since the tensor need to be splitted
# it is fast in CPU
feed_tensor.set(feed[feed_name], core.CPUPlace())
feed_tensor_dict[feed_name] = feed_tensor
self.executor.feed_and_split_tensor_into_local_scopes(
feed_tensor_dict)
elif isinstance(feed, list) or isinstance(feed, tuple):
if len(feed) != len(self._places):
raise ValueError(
"Feed a list of tensor, the list should be the same size as places"
)
res = list()
for i, each in enumerate(feed):
if not isinstance(each, dict):
raise TypeError(
"Each element of feed list should be a dict")
res_dict = dict()
for feed_name in each:
tensor = each[feed_name]
if not isinstance(tensor, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(tensor, self._places[i])
tensor = tmp
res_dict[feed_name] = tensor
res.append(res_dict)
self.executor.feed_tensors_into_local_scopes(res)
fetch_var_name = 'fetch'
self.executor.run(fetch_list, fetch_var_name)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if return_numpy:
return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))]
return self._executor.run(program=self._compiled_program,
scope=self._scope,
feed=feed,
fetch_list=fetch_list,
return_numpy=return_numpy)
@property
def device_count(self):
......
......@@ -15,44 +15,139 @@
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp
class TestMKLDNN(TestConv2dOp):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
def conv2d_bias_naive(out, bias):
_, out_c, _, _ = out.shape
for l in range(out_c):
out[:, l, :, :] = out[:, l, :, :] + bias[l]
return out
class TestMKLDNNWithPad(TestWithPad):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
def conv2d_residual_naive(out, residual):
assert out.shape == residual.shape
out = np.add(out, residual)
return out
class TestMKLDNNWithStride(TestWithStride):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestConv2dMKLDNNOp(TestConv2dOp):
def init_group(self):
self.groups = 1
class TestMKLDNNWithGroup(TestWithGroup):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
self.use_mkldnn = True
self._cpu_only = True
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestMKLDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
def setUp(self):
self.fuse_bias = False
self.bias_size = None
self.fuse_relu = False
self.fuse_residual_connection = False
self.input_residual_size = None
TestConv2dOp.setUp(self)
output = self.outputs['Output']
class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
# mkldnn only supports either conv-sum-relu or conv-relu.
if self.fuse_bias and self.bias_size is not None:
bias = np.random.random(self.bias_size).astype(self.dtype)
output = conv2d_bias_naive(output, bias)
output = output.astype(self.dtype)
self.attrs['fuse_bias'] = self.fuse_bias
self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
if self.fuse_residual_connection and self.input_residual_size is not None:
input_residual = np.random.random(self.input_residual_size).astype(
self.dtype)
output = conv2d_residual_naive(output, input_residual)
self.attrs[
'fuse_residual_connection'] = self.fuse_residual_connection
self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
input_residual)
if self.fuse_relu:
output = np.maximum(output, 0).astype(self.dsttype)
output = output.astype(self.dtype)
self.attrs['fuse_bias'] = self.fuse_bias
self.attrs['fuse_relu'] = self.fuse_relu
self.attrs['fuse_residual_connection'] = self.fuse_residual_connection
self.outputs['Output'] = output
class TestWithFuse(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.fuse_bias = True
self.bias_size = [6]
self.fuse_residual_connection = True
self.input_residual_size = [2, 6, 5, 5]
def test_check_grad(self):
pass
def test_check_grad_no_filter(self):
pass
def test_check_grad_no_input(self):
pass
class TestWithPadWithBias(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.input_size = [2, 3, 6, 6]
class TestWithStride(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6]
class TestWithGroup(TestConv2dMKLDNNOp):
def init_group(self):
self.groups = 3
class TestWith1x1(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.filter_size = [6, 3, 1, 1]
class TestWithInput1x1Filter1x1(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.input_size = [2, 3, 1, 1] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 1, 1]
def init_group(self):
self.groups = 3
if __name__ == '__main__':
......
......@@ -18,6 +18,24 @@ import unittest
from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
def create_test_mkldnn_use_ceil_class(parent):
class TestMKLDNNPool2DUseCeilCase(parent):
def init_kernel_type(self):
self.use_mkldnn = True
def init_ceil_mode(self):
self.ceil_mode = True
cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNCeilModeCast")
TestMKLDNNPool2DUseCeilCase.__name__ = cls_name
globals()[cls_name] = TestMKLDNNPool2DUseCeilCase
create_test_mkldnn_use_ceil_class(TestPool2D_Op)
create_test_mkldnn_use_ceil_class(TestCase1)
create_test_mkldnn_use_ceil_class(TestCase2)
def create_test_mkldnn_class(parent):
class TestMKLDNNCase(parent):
def init_kernel_type(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import ast
import hashlib
import importlib
import paddle.fluid
files = [
"paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward",
"paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor",
"paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers",
"paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer",
"paddle.fluid.profiler", "paddle.fluid.recordio_writer",
"paddle.fluid.regularizer", "paddle.fluid.transpiler"
]
def md5(doc):
hash = hashlib.md5()
hash.update(str(doc))
return hash.hexdigest()
def get_module():
for fi in files:
fi_lib = importlib.import_module(fi)
doc_function = getattr(fi_lib, "__all__")
for api in doc_function:
api_name = fi + "." + api
try:
doc_module = getattr(eval(api_name), "__doc__")
except:
pass
doc_md5_code = md5(doc_module)
doc_dict[api_name] = doc_md5_code
def doc_md5_dict(doc_md5_path):
with open(doc_md5_path, "rb") as f:
doc_md5 = f.read()
doc_md5_dict = ast.literal_eval(doc_md5)
return doc_md5_dict
def check_doc_md5():
for k, v in doc_dict.items():
try:
if doc_ci_dict[k] != v:
return doc_dict
except:
return doc_dict
return True
if __name__ == "__main__":
doc_dict = {}
doc_ci_dict = {}
doc_md5_file = "/root/.cache/doc_md5.txt"
if not os.path.exists(doc_md5_file):
os.mknod(doc_md5_file)
else:
doc_ci_dict = doc_md5_dict(doc_md5_file)
get_module()
if not os.path.getsize(doc_md5_file):
with open(doc_md5_file, 'w') as f:
f.write(str(doc_dict))
check_dic = True
print(check_dic)
else:
check_dic = check_doc_md5()
print(check_dic)
......@@ -26,4 +26,10 @@ for each_diff in result:
print(each_diff)
if error:
print(
'''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
1. cd ${paddle_path}, compile paddle;
2. pip install build/python/dist/(build whl package);
3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"'''
)
sys.exit(1)
......@@ -24,12 +24,19 @@ import inspect
import collections
import sys
import pydoc
import hashlib
member_dict = collections.OrderedDict()
experimental_namespace = {"paddle.fluid.imperative"}
def md5(doc):
hash = hashlib.md5()
hash.update(str(doc).encode('utf-8'))
return hash.hexdigest()
def visit_member(parent_name, member):
cur_name = ".".join([parent_name, member.__name__])
if inspect.isclass(member):
......@@ -39,7 +46,10 @@ def visit_member(parent_name, member):
visit_member(cur_name, value)
elif callable(member):
try:
member_dict[cur_name] = inspect.getargspec(member)
doc = ('document', md5(member.__doc__))
args = inspect.getargspec(member)
all = (args, doc)
member_dict[cur_name] = all
except TypeError: # special for PyBind method
member_dict[cur_name] = " ".join([
line.strip() for line in pydoc.render_doc(member).split('\n')
......
......@@ -131,7 +131,7 @@ class Timeline(object):
if (k, event.device_id, "CPU") not in self._devices:
pid = self._allocate_pid()
self._devices[(k, event.device_id, "CPU")] = pid
# -1 device id represents CUDA api call
# -1 device id represents a CUDA API (runtime) call, e.g. cudaLaunch, cudaMemcpy
if event.device_id == -1:
self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
else:
......