Commit aca9180a authored by chengduoZH

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/fix_concat

@@ -22,9 +22,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/grpc_service.h"
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/detail/send_recv.pb.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/operators/detail/simple_block_queue.h"

 namespace paddle {
 namespace operators {
...
@@ -199,9 +199,9 @@ TEST(LodTensor, Run) {
   RunTestLodTensor(place);
   RunTestLodTensor(place, 1);
 #ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace place;
-  RunTestLodTensor(place);
-  RunTestLodTensor(place, 1);
+  platform::CUDAPlace gpu(0);
+  RunTestLodTensor(gpu);
+  RunTestLodTensor(gpu, 1);
 #endif
 }
@@ -210,7 +210,7 @@ TEST(SelectedRows, Run) {
   RunSerdeTestSelectedRows(place);
 #ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace place;
-  RunSerdeTestSelectedRows(place);
+  platform::CUDAPlace gpu;
+  RunSerdeTestSelectedRows(gpu);
 #endif
 }
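Note: renaming `place` to `gpu` in both tests presumably fixes a compile error rather than behavior: `place` is already in scope when the `#ifdef PADDLE_WITH_CUDA` branch runs (it is passed to the test helper just above), so declaring a second `place` of a different type in the same scope is a redefinition. A minimal standalone illustration of the C++ rule (hypothetical names, not Paddle code):

#include <iostream>

int main() {
  int place = 1;
  // int place = 2;   // error: redefinition of 'place' in the same scope
  {
    double place = 2.5;  // shadowing in an inner scope would be legal,
                         // but the test declares both in one scope
    std::cout << place << "\n";
  }
  std::cout << place << "\n";
}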
@@ -93,12 +93,6 @@ class ListenAndServOp : public framework::OperatorBase {
                    "server program should have at least 2 blocks");

     framework::Executor executor(dev_place);
-    std::vector<framework::ExecutorPrepareContext *> blk_ctx_list;
-    blk_ctx_list.push_back(nullptr);  // block0 is not used.
-    for (int blkid = 1; blkid < num_blocks; ++blkid) {
-      auto *exe_ctx = executor.Prepare(*program, blkid);
-      blk_ctx_list.push_back(exe_ctx);
-    }

     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
     bool exit_flag = false;
@@ -149,12 +143,11 @@ class ListenAndServOp : public framework::OperatorBase {
       std::vector<std::future<void>> fs;
       // block0 contains only listen_and_serv op, start run from block1.
       for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
-        fs.push_back(framework::Async(
-            [&executor, &program, &recv_scope, &blk_ctx_list, blkid]() {
+        fs.push_back(
+            framework::Async([&executor, &program, &recv_scope, blkid]() {
              int run_block = blkid;  // thread local
              try {
-                executor.RunPreparedContext(blk_ctx_list[run_block],
-                                            &recv_scope, false, false);
+                executor.Run(*program, &recv_scope, run_block, false, false);
              } catch (std::exception &e) {
                LOG(ERROR) << "run sub program error " << e.what();
              }
@@ -164,8 +157,7 @@ class ListenAndServOp : public framework::OperatorBase {
       // Run global block at final step, or block1 if there are only 2 blocks
       if (num_blocks >= 2) {
         try {
-          executor.RunPreparedContext(blk_ctx_list[num_blocks - 1], &recv_scope,
-                                      false, false);
+          executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
         } catch (std::exception &e) {
           LOG(ERROR) << "run sub program error " << e.what();
         }
@@ -185,9 +177,9 @@ class ListenAndServOp : public framework::OperatorBase {
         sparse_vars.clear();
       }  // while(true)

-      for (int i = 0; i < num_blocks; ++i) {
-        delete blk_ctx_list[i];
-      }
+      // for (int i = 0; i < num_blocks; ++i) {
+      //   delete blk_ctx_list[i];
+      // }
     }

  protected:
...
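Note: this file drops the cached `ExecutorPrepareContext` list and goes back to plain `Executor::Run`, which re-prepares the block's ops on every call; presumably this sidesteps sharing one prepared context across the worker threads spawned via `framework::Async` (and the manual `delete` loop above, now commented out). A toy, self-contained sketch of the two execution styles — stand-in names, not Paddle's real classes:

#include <memory>

struct PreparedCtx {};  // stands in for framework::ExecutorPrepareContext

struct Exec {
  std::unique_ptr<PreparedCtx> Prepare(int /*block_id*/) {
    return std::make_unique<PreparedCtx>();  // op-creation cost paid once
  }
  void RunPreparedContext(PreparedCtx* /*ctx*/) {}  // reuses cached ops
  void Run(int block_id) {  // prepares internally on every call
    auto ctx = Prepare(block_id);
    RunPreparedContext(ctx.get());
  }
};

int main() {
  Exec e;
  auto ctx = e.Prepare(1);                                      // before
  for (int i = 0; i < 3; ++i) e.RunPreparedContext(ctx.get());
  for (int i = 0; i < 3; ++i) e.Run(1);                         // after
}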
@@ -20,7 +20,7 @@ namespace math {

 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatFunctor<platform::CPUDeviceContext, T> {
@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {

 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
...
@@ -125,7 +125,7 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,

 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatFunctor<platform::CUDADeviceContext, T> {
@@ -195,7 +195,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {

 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...
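Note: the comment wording tightened above ("are the same" → "must be the same") states concat's shape contract: all inputs must agree in rank and in every dimension except the concatenation axis. A standalone illustration of that check in plain C++ (not Paddle's ConcatFunctor):

#include <cassert>
#include <vector>

// Returns true iff the shapes can be concatenated along `axis`:
// same rank, and every dimension other than `axis` matches exactly.
bool ShapesConcatable(const std::vector<std::vector<int>>& dims, int axis) {
  for (size_t i = 1; i < dims.size(); ++i) {
    if (dims[i].size() != dims[0].size()) return false;  // rank mismatch
    for (size_t d = 0; d < dims[0].size(); ++d)
      if (static_cast<int>(d) != axis && dims[i][d] != dims[0][d]) return false;
  }
  return true;
}

int main() {
  assert(ShapesConcatable({{2, 3}, {2, 5}}, 1));   // [2,3]+[2,5] -> [2,8]
  assert(!ShapesConcatable({{2, 3}, {4, 3}}, 1));  // rows differ: invalid
}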
@@ -21,6 +21,7 @@ limitations under the License. */

 #include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
...
@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);

+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
     auto client_var_name = Output("RPCClient");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
                             "Can not find variable '%s' in the scope.",
...
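Note: `platform::RecordEvent` as used above is an RAII profiling marker: construction records the start of the scope and destruction records the end, so the whole operator body shows up as one named event in the profiler output. A toy standalone equivalent of the pattern (not Paddle's implementation):

#include <chrono>
#include <iostream>
#include <string>

// RAII scope timer: the constructor marks the start of the scope and the
// destructor marks the end, mirroring how RecordEvent brackets Run().
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_).count();
    std::cout << name_ << " took " << us << " us\n";
  }
 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedTimer t("send_op");  // analogous to RecordEvent(Type(), &ctx)
  // ... work being profiled ...
}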
@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)

-cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
+cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
...