Commit aca9180a authored by chengduoZH

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/fix_concat

@@ -22,9 +22,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/grpc_service.h"
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/detail/send_recv.pb.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/operators/detail/simple_block_queue.h"

 namespace paddle {
 namespace operators {
...
@@ -199,9 +199,9 @@ TEST(LodTensor, Run) {
   RunTestLodTensor(place);
   RunTestLodTensor(place, 1);
 #ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace place;
-  RunTestLodTensor(place);
-  RunTestLodTensor(place, 1);
+  platform::CUDAPlace gpu(0);
+  RunTestLodTensor(gpu);
+  RunTestLodTensor(gpu, 1);
 #endif
 }
@@ -210,7 +210,7 @@ TEST(SelectedRows, Run) {
   RunSerdeTestSelectedRows(place);
 #ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace place;
-  RunSerdeTestSelectedRows(place);
+  platform::CUDAPlace gpu;
+  RunSerdeTestSelectedRows(gpu);
 #endif
 }
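Note: renaming `place` to `gpu` in both tests presumably fixes a compile error rather than behavior: `place` is already in scope when the `#ifdef PADDLE_WITH_CUDA` branch runs (it is passed to the test helper just above), so declaring a second `place` of a different type in the same scope is a redefinition. A minimal standalone illustration of the C++ rule (hypothetical names, not Paddle code):

#include <iostream>

int main() {
  int place = 1;
  // int place = 2;   // error: redefinition of 'place' in the same scope
  {
    double place = 2.5;  // shadowing in an inner scope would be legal,
                         // but the test declares both in one scope
    std::cout << place << "\n";
  }
  std::cout << place << "\n";
}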
@@ -93,12 +93,6 @@ class ListenAndServOp : public framework::OperatorBase {
                    "server program should have at least 2 blocks");

     framework::Executor executor(dev_place);
-    std::vector<framework::ExecutorPrepareContext *> blk_ctx_list;
-    blk_ctx_list.push_back(nullptr);  // block0 is not used.
-    for (int blkid = 1; blkid < num_blocks; ++blkid) {
-      auto *exe_ctx = executor.Prepare(*program, blkid);
-      blk_ctx_list.push_back(exe_ctx);
-    }

     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
     bool exit_flag = false;
@@ -149,12 +143,11 @@ class ListenAndServOp : public framework::OperatorBase {
       std::vector<std::future<void>> fs;
       // block0 contains only listen_and_serv op, start run from block1.
       for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
-        fs.push_back(framework::Async(
-            [&executor, &program, &recv_scope, &blk_ctx_list, blkid]() {
+        fs.push_back(
+            framework::Async([&executor, &program, &recv_scope, blkid]() {
              int run_block = blkid;  // thread local
              try {
-                executor.RunPreparedContext(blk_ctx_list[run_block],
-                                            &recv_scope, false, false);
+                executor.Run(*program, &recv_scope, run_block, false, false);
              } catch (std::exception &e) {
                LOG(ERROR) << "run sub program error " << e.what();
              }
@@ -164,8 +157,7 @@ class ListenAndServOp : public framework::OperatorBase {
       // Run global block at final step, or block1 if there are only 2 blocks
       if (num_blocks >= 2) {
         try {
-          executor.RunPreparedContext(blk_ctx_list[num_blocks - 1], &recv_scope,
-                                      false, false);
+          executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
         } catch (std::exception &e) {
           LOG(ERROR) << "run sub program error " << e.what();
         }
@@ -185,9 +177,9 @@ class ListenAndServOp : public framework::OperatorBase {
         sparse_vars.clear();
       }  // while(true)

-      for (int i = 0; i < num_blocks; ++i) {
-        delete blk_ctx_list[i];
-      }
+      // for (int i = 0; i < num_blocks; ++i) {
+      //   delete blk_ctx_list[i];
+      // }
     }

  protected:
...
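Note: this file drops the cached `ExecutorPrepareContext` list and goes back to plain `Executor::Run`, which re-prepares the block's ops on every call; presumably this sidesteps sharing one prepared context across the worker threads spawned via `framework::Async` (and the manual `delete` loop above, now commented out). A toy, self-contained sketch of the two execution styles — stand-in names, not Paddle's real classes:

#include <memory>

struct PreparedCtx {};  // stands in for framework::ExecutorPrepareContext

struct Exec {
  std::unique_ptr<PreparedCtx> Prepare(int /*block_id*/) {
    return std::make_unique<PreparedCtx>();  // op-creation cost paid once
  }
  void RunPreparedContext(PreparedCtx* /*ctx*/) {}  // reuses cached ops
  void Run(int block_id) {  // prepares internally on every call
    auto ctx = Prepare(block_id);
    RunPreparedContext(ctx.get());
  }
};

int main() {
  Exec e;
  auto ctx = e.Prepare(1);                                      // before
  for (int i = 0; i < 3; ++i) e.RunPreparedContext(ctx.get());
  for (int i = 0; i < 3; ++i) e.Run(1);                         // after
}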
@@ -20,7 +20,7 @@ namespace math {

 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatFunctor<platform::CPUDeviceContext, T> {
@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {

 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
...
@@ -125,7 +125,7 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,

 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatFunctor<platform::CUDADeviceContext, T> {
@@ -195,7 +195,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {

 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...
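Note: the comment wording tightened above ("are the same" → "must be the same") states concat's shape contract: all inputs must agree in rank and in every dimension except the concatenation axis. A standalone illustration of that check in plain C++ (not Paddle's ConcatFunctor):

#include <cassert>
#include <vector>

// Returns true iff the shapes can be concatenated along `axis`:
// same rank, and every dimension other than `axis` matches exactly.
bool ShapesConcatable(const std::vector<std::vector<int>>& dims, int axis) {
  for (size_t i = 1; i < dims.size(); ++i) {
    if (dims[i].size() != dims[0].size()) return false;  // rank mismatch
    for (size_t d = 0; d < dims[0].size(); ++d)
      if (static_cast<int>(d) != axis && dims[i][d] != dims[0][d]) return false;
  }
  return true;
}

int main() {
  assert(ShapesConcatable({{2, 3}, {2, 5}}, 1));   // [2,3]+[2,5] -> [2,8]
  assert(!ShapesConcatable({{2, 3}, {4, 3}}, 1));  // rows differ: invalid
}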
@@ -21,6 +21,7 @@ limitations under the License. */

 #include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
...
@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);

+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
     auto client_var_name = Output("RPCClient");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
                             "Can not find variable '%s' in the scope.",
...
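Note: `platform::RecordEvent` as used above is an RAII profiling marker: construction records the start of the scope and destruction records the end, so the whole operator body shows up as one named event in the profiler output. A toy standalone equivalent of the pattern (not Paddle's implementation):

#include <chrono>
#include <iostream>
#include <string>

// RAII scope timer: the constructor marks the start of the scope and the
// destructor marks the end, mirroring how RecordEvent brackets Run().
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_).count();
    std::cout << name_ << " took " << us << " us\n";
  }
 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedTimer t("send_op");  // analogous to RecordEvent(Type(), &ctx)
  // ... work being profiled ...
}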
@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)

-cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
+cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
...