diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 128a5344fbb8c64c36ade24475bd0d99bdb3e0f5..e7a0cb678ebfd8a3fe5f873e995b63b0857e5ba4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -59,7 +59,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( auto graph = new SSAGraph(); SSAGraph &result = *graph; std::unordered_set og_has_been_broadcast; - result.vars_.resize(places_.size()); + + // We cannot invoke resize. It is a bug of GCC 4.8 + result.vars_ = std::vector< + std::unordered_map>>>( + places_.size()); bool is_forwarding = true; for (auto *op : program.Block(0).AllOps()) { @@ -147,15 +151,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( if (vars.empty()) { // This device has no data. continue. continue; } - auto *prev_grad = &vars[vars.size() - 1]; - op_handle->AddInput(prev_grad); + auto &prev_grad = vars[vars.size() - 1]; + op_handle->AddInput(prev_grad.get()); - auto &var = vars[vars.size()]; - var.place_ = p; - var.name_ = og; - var.version_ = vars.size() - 1; + vars.emplace_back(new VarHandle); + auto &var = vars.back(); + var->place_ = p; + var->name_ = og; + var->version_ = vars.size() - 1; - op_handle->AddOutput(&var); + op_handle->AddOutput(var.get()); } #else PADDLE_ENFORCE("Not implemented"); diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h index ac3e2d86993aee31b79f4481c4d5a47cd9cdf5b4..72684e7f97f1324d0efba960903cf9f2acb618a4 100644 --- a/paddle/fluid/framework/details/ssa_graph.h +++ b/paddle/fluid/framework/details/ssa_graph.h @@ -16,6 +16,8 @@ #include #include +#include + #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/var_handle.h" @@ -24,7 +26,9 @@ namespace framework { namespace details { struct SSAGraph { - std::vector>> vars_; + std::vector< + std::unordered_map>>> + vars_; // aux variables to represent dependency. Useful to resolve data hazard. std::unordered_set> dep_vars_; std::vector> ops_; diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 0a4febd22f3feefdcac99cafc2cb58269380d192..be5fb7577581fd99b1b7b80ccdd2acb8d3a91f01 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { auto it_old = name_pair.second.rbegin(); ++it_old; for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - auto *write_op = it_new->second.generated_op_; - auto &read_ops = it_old->second.pending_ops_; + auto *write_op = (*it_new)->generated_op_; + auto &read_ops = (*it_old)->pending_ops_; for (auto *read_op : read_ops) { // Manually add a dependency var from read_op to write_op; @@ -54,14 +54,15 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( auto &var_holder = var_holders[each_var_name]; VarHandle *var = nullptr; if (var_holder.empty()) { + var_holder.emplace_back(new VarHandle); auto &init_var = var_holder[0]; - init_var.place_ = place; - init_var.name_ = each_var_name; - init_var.generated_op_ = nullptr; - init_var.version_ = 0; - var = &init_var; + init_var->place_ = place; + init_var->name_ = each_var_name; + init_var->generated_op_ = nullptr; + init_var->version_ = 0; + var = init_var.get(); } else { - var = &var_holder.rbegin()->second; + var = var_holder.rbegin()->get(); } return var; } @@ -72,11 +73,12 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, size_t place_offset) { auto &vars = graph->vars_[place_offset][each_var_name]; size_t version = vars.size(); - auto &var = vars[version]; - var.version_ = version; - var.name_ = each_var_name; - var.place_ = place; - op_handle->AddOutput(&var); + vars.emplace_back(new VarHandle()); + auto &var = vars.back(); + var->version_ = version; + var->name_ = each_var_name; + var->place_ = place; + op_handle->AddOutput(var.get()); } template @@ -84,7 +86,7 @@ void IterAllVar(const SSAGraph &graph, Callback callback) { for (auto &each : graph.vars_) { for (auto &pair1 : each) { for (auto &pair2 : pair1.second) { - callback(pair2.second); + callback(*pair2); } } } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 596e5731868630cebc3cf51b2e78d4deb39a9b33..62af4c1d79ded5eaa30e4e6d43cc0d7327ae9689 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -69,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &var_map : graph_->vars_) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { - InsertPendingVar(version_pair.second); + InsertPendingVar(*version_pair); } } } @@ -95,7 +95,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &var_map : graph_->vars_) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { - fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); + fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); } } } diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 0b4238436ffcc586fe8bc7abbe4cfbc1654dcb88..415182201a7a9e11d8ea8c62b92849b5ea3bac3e 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/transform.h" #ifdef __NVCC__ +#include #include -#include "paddle/fluid/platform/cuda_helper.h" constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif @@ -43,35 +44,35 @@ namespace operators { */ inline void get_mid_dims(const framework::DDim& x_dims, const framework::DDim& y_dims, const int axis, - int& pre, int& n, int& post) { - pre = 1; - n = 1; - post = 1; + int* pre, int* n, int* post) { + *pre = 1; + *n = 1; + *post = 1; for (int i = 0; i < axis; ++i) { - pre *= x_dims[i]; + (*pre) *= x_dims[i]; } for (int i = 0; i < y_dims.size(); ++i) { PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], "Broadcast dimension mismatch."); - n *= y_dims[i]; + (*n) *= y_dims[i]; } for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - post *= x_dims[i]; + (*post) *= x_dims[i]; } } -inline void trim_trailing_singular_dims(framework::DDim& dims) { +inline void trim_trailing_singular_dims(framework::DDim* dims) { // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims.size(); + auto actual_dims_size = dims->size(); for (; actual_dims_size != 0; --actual_dims_size) { - if (dims[actual_dims_size - 1] != 1) break; + if ((*dims)[actual_dims_size - 1] != 1) break; } - if (actual_dims_size != dims.size()) { - auto actual_dims = framework::vectorize(dims); + if (actual_dims_size != dims->size()) { + auto actual_dims = framework::vectorize(*dims); actual_dims.resize(actual_dims_size); - dims = framework::make_ddim(actual_dims); + *dims = framework::make_ddim(actual_dims); } } @@ -159,7 +160,7 @@ class RowwiseTransformIterator RowwiseTransformIterator, const T*> super_t; HOSTDEVICE RowwiseTransformIterator(const T* x, int n) - : super_t(x), begin_(x), n_(n){}; + : super_t(x), begin_(x), n_(n) {} friend class thrust::iterator_core_access; private: @@ -179,7 +180,7 @@ class MidWiseTransformIterator MidWiseTransformIterator, const T*> super_t; HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) - : super_t(x), begin_(x), n_(n), post_(post){}; + : super_t(x), begin_(x), n_(n), post_(post) {} friend class thrust::iterator_core_access; private: @@ -333,6 +334,55 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, } } #ifdef __NVCC__ + +// __shfl_down has been deprecated as of CUDA 9.0. +#if CUDA_VERSION < 9000 +template +__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { + return __shfl_down(val, delta); +} +#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; +#else +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + +template +__device__ T reduceSum(T val, int tid, int len) { + // TODO(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + __shared__ T shm[32]; + const int warpSize = 32; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + } + + return val; +} + template static __global__ void ElemwiseGradBroadcast1CUDAKernel( const T* x, const T* y, const T* out, const T* dout, int h, int w, @@ -355,7 +405,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( if (dy) { h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = platform::reduceSum(val, tid, h); + val = reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } @@ -432,7 +482,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( if (dy) { int h = pre * post; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = platform::reduceSum(val, tid, h); + val = reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } @@ -472,11 +522,11 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, auto y_dim = y.dims(); axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - trim_trailing_singular_dims(y_dim); + trim_trailing_singular_dims(&y_dim); axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, pre, n, post); + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); if (post == 1) { int h = pre; int w = n; @@ -514,7 +564,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, } } } -}; +} template @@ -543,11 +593,11 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, } axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - trim_trailing_singular_dims(y_dims); + trim_trailing_singular_dims(&y_dims); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, pre, n, post); + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); if (post == 1) { broadcastfunctor f; @@ -582,11 +632,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(y_dims); + trim_trailing_singular_dims(&y_dims); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, pre, n, post); + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); if (post == 1) { functor.RunRowWise(n, pre); return; diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index a4ea4f21e3c16c9292cf67863616924e9d9f8aba..881d611d4ac26f992036f639097815aff625227b 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -62,53 +62,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) { } #endif -// __shfl_down has been deprecated as of CUDA 9.0. -#if CUDA_VERSION < 9000 -template -__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { - return __shfl_down(val, delta); -} -#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; -#else -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif - -template -__device__ T reduceSum(T val, int tid, int len) { - // TODO(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. - // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. - __shared__ T shm[32]; - const int warpSize = 32; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += __shfl_down_sync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += __shfl_down_sync(mask, val, offset); - } - - return val; -} - } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 3c6be913200716ae4f70e2b48ee8faf8078223d2..0ec3ebc7e3dba6e4cf89c8a76622761d210276cf 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -278,11 +278,21 @@ class DistributeTranspiler: # we don't need to create them when grad arrives. # change client side var name to origin name by # removing ".trainer_%d" suffix + suff_idx = v.name.find(".trainer_") if suff_idx >= 0: orig_var_name = v.name[:suff_idx] else: orig_var_name = v.name + # NOTE: single_trainer_var must be created for multi-trainer + # case to merge grads from multiple trainers + single_trainer_var = \ + pserver_program.global_block().create_var( + name=orig_var_name, + persistable=True, + type=v.type, + dtype=v.dtype, + shape=v.shape) if self.trainers > 1: for trainer_id in xrange(self.trainers): var = pserver_program.global_block().create_var( @@ -293,12 +303,6 @@ class DistributeTranspiler: shape=v.shape) recv_inputs.append(var) else: - single_trainer_var = pserver_program.global_block().create_var( - name=orig_var_name, - persistable=True, - type=v.type, - dtype=v.dtype, - shape=v.shape) recv_inputs.append(single_trainer_var) # step3 diff --git a/python/setup.py.in b/python/setup.py.in index 5e7096e225e08d19e89051603bbc07eff945c78a..a811b509a90b8b0d84451f54462a0308c062d022 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -102,7 +102,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF': package_data['py_paddle']=['*.py','_swig_paddle.so'] package_dir={ - '': '${CMAKE_CURRENT_SOURCE_DIR}', + '': '${PADDLE_BINARY_DIR}/python', # The paddle.fluid.proto will be generated while compiling. # So that package points to other directory. 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',