Commit 56b04e5b authored by _青葱

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into initializer

@@ -59,7 +59,11 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   auto graph = new SSAGraph();
   SSAGraph &result = *graph;
   std::unordered_set<std::string> og_has_been_broadcast;
-  result.vars_.resize(places_.size());
+  // We cannot invoke resize. It is a bug of GCC 4.8
+  result.vars_ = std::vector<
+      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
+      places_.size());

   bool is_forwarding = true;
   for (auto *op : program.Block(0).AllOps()) {
@@ -147,15 +151,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
           if (vars.empty()) {  // This device has no data. continue.
             continue;
           }
-          auto *prev_grad = &vars[vars.size() - 1];
-          op_handle->AddInput(prev_grad);
-          auto &var = vars[vars.size()];
-          var.place_ = p;
-          var.name_ = og;
-          var.version_ = vars.size() - 1;
-          op_handle->AddOutput(&var);
+          auto &prev_grad = vars[vars.size() - 1];
+          op_handle->AddInput(prev_grad.get());
+          vars.emplace_back(new VarHandle);
+          auto &var = vars.back();
+          var->place_ = p;
+          var->name_ = og;
+          var->version_ = vars.size() - 1;
+          op_handle->AddOutput(var.get());
         }
 #else
         PADDLE_ENFORCE("Not implemented");
...
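Note on the hunk above: with the old `std::map<int, VarHandle>` storage, `vars[vars.size()]` default-constructed the next version in place, but with the new `std::vector<std::unique_ptr<VarHandle>>` that expression would index past the end, so a new version now has to be appended with `emplace_back` and filled in through `back()`. The builder also assigns a freshly constructed vector instead of calling `resize()`, which the inline comment attributes to a GCC 4.8 bug. A minimal sketch of the new append pattern (simplified `VarHandle`, illustrative only):

```cpp
#include <memory>
#include <string>
#include <vector>

// Simplified stand-in for paddle::framework::details::VarHandle.
struct VarHandle {
  std::string name_;
  size_t version_ = 0;
};

int main() {
  std::vector<std::unique_ptr<VarHandle>> vars;

  // The map-based code could write `vars[vars.size()]` and let operator[]
  // create the entry; with a vector the new version must be appended.
  vars.emplace_back(new VarHandle);
  auto &var = vars.back();
  var->name_ = "w@GRAD";
  var->version_ = vars.size() - 1;  // versions are dense: 0, 1, 2, ...
  return 0;
}
```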
@@ -16,6 +16,8 @@
 #include <map>
 #include <string>
+#include <vector>

 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/var_handle.h"
@@ -24,7 +26,9 @@ namespace framework {
 namespace details {

 struct SSAGraph {
-  std::vector<std::unordered_map<std::string, std::map<int, VarHandle>>> vars_;
+  std::vector<
+      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
+      vars_;
   // aux variables to represent dependency. Useful to resolve data hazard.
   std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
   std::vector<std::unique_ptr<OpHandleBase>> ops_;
...
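The per-place, per-name holder changes from `std::map<int, VarHandle>` (version number to handle, stored by value) to `std::vector<std::unique_ptr<VarHandle>>`, with the vector index acting as the version. Because op handles keep raw `VarHandle*` obtained through `AddInput`/`AddOutput`, the handles must not move when later versions are appended; heap-allocating each handle behind a `unique_ptr` keeps those raw pointers valid even when the vector reallocates. A small sketch of that pointer-stability property (simplified type, not the real `VarHandle`):

```cpp
#include <cassert>
#include <memory>
#include <vector>

struct VarHandle { int version_; };  // simplified stand-in

int main() {
  std::vector<std::unique_ptr<VarHandle>> vars;
  vars.emplace_back(new VarHandle{0});
  VarHandle *held_elsewhere = vars[0].get();  // e.g. a pointer stored by an op handle

  // Appending more versions may reallocate the vector's buffer, but the
  // heap-allocated handles themselves never move, so raw pointers stay valid.
  for (int v = 1; v < 1000; ++v) vars.emplace_back(new VarHandle{v});
  assert(held_elsewhere == vars[0].get());
  return 0;
}
```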
@@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
       auto it_old = name_pair.second.rbegin();
       ++it_old;
       for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        auto *write_op = it_new->second.generated_op_;
-        auto &read_ops = it_old->second.pending_ops_;
+        auto *write_op = (*it_new)->generated_op_;
+        auto &read_ops = (*it_old)->pending_ops_;

         for (auto *read_op : read_ops) {
           // Manually add a dependency var from read_op to write_op;
@@ -54,14 +54,15 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
   auto &var_holder = var_holders[each_var_name];
   VarHandle *var = nullptr;
   if (var_holder.empty()) {
+    var_holder.emplace_back(new VarHandle);
     auto &init_var = var_holder[0];
-    init_var.place_ = place;
-    init_var.name_ = each_var_name;
-    init_var.generated_op_ = nullptr;
-    init_var.version_ = 0;
-    var = &init_var;
+    init_var->place_ = place;
+    init_var->name_ = each_var_name;
+    init_var->generated_op_ = nullptr;
+    init_var->version_ = 0;
+    var = init_var.get();
   } else {
-    var = &var_holder.rbegin()->second;
+    var = var_holder.rbegin()->get();
   }
   return var;
 }
@@ -72,11 +73,12 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
                                      size_t place_offset) {
   auto &vars = graph->vars_[place_offset][each_var_name];
   size_t version = vars.size();
-  auto &var = vars[version];
-  var.version_ = version;
-  var.name_ = each_var_name;
-  var.place_ = place;
-  op_handle->AddOutput(&var);
+  vars.emplace_back(new VarHandle());
+  auto &var = vars.back();
+  var->version_ = version;
+  var->name_ = each_var_name;
+  var->place_ = place;
+  op_handle->AddOutput(var.get());
 }

 template <typename Callback>
@@ -84,7 +86,7 @@ void IterAllVar(const SSAGraph &graph, Callback callback) {
   for (auto &each : graph.vars_) {
     for (auto &pair1 : each) {
       for (auto &pair2 : pair1.second) {
-        callback(pair2.second);
+        callback(*pair2);
       }
     }
   }
...
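Taken together, `CreateOrGetLatestVarHandle` and `CreateOpOutput` maintain SSA versions per variable name: a read returns the newest handle (creating version 0 on first use), and a write appends a handle whose version equals the holder's previous size. A condensed sketch of that read/write protocol over the new storage (simplified types; `GetLatest`/`AppendVersion` are hypothetical helper names):

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct VarHandle {
  std::string name_;
  size_t version_ = 0;
};

using VarMap =
    std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>;

// Read: return the newest version, creating version 0 on first use.
VarHandle *GetLatest(VarMap &vars, const std::string &name) {
  auto &holder = vars[name];
  if (holder.empty()) {
    holder.emplace_back(new VarHandle);
    holder.back()->name_ = name;
    holder.back()->version_ = 0;
  }
  return holder.back().get();
}

// Write: append the next version.
VarHandle *AppendVersion(VarMap &vars, const std::string &name) {
  auto &holder = vars[name];
  holder.emplace_back(new VarHandle);
  holder.back()->name_ = name;
  holder.back()->version_ = holder.size() - 1;
  return holder.back().get();
}

int main() {
  VarMap vars;
  VarHandle *r0 = GetLatest(vars, "x");      // first read creates version 0
  VarHandle *w1 = AppendVersion(vars, "x");  // a write appends version 1
  assert(r0->version_ == 0 && w1->version_ == 1);
  assert(GetLatest(vars, "x") == w1);        // later reads see the newest handle
  return 0;
}
```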
@@ -69,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   for (auto &var_map : graph_->vars_) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(version_pair.second);
+        InsertPendingVar(*version_pair);
       }
     }
   }
@@ -95,7 +95,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     for (auto &var_map : graph_->vars_) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second);
+        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
       }
     }
   }
...
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <algorithm>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/transform.h"

 #ifdef __NVCC__
+#include <cuda.h>
 #include <thrust/iterator/iterator_adaptor.h>
-#include "paddle/fluid/platform/cuda_helper.h"
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
 #endif
@@ -43,35 +44,35 @@ namespace operators {
  */
 inline void get_mid_dims(const framework::DDim& x_dims,
                          const framework::DDim& y_dims, const int axis,
-                         int& pre, int& n, int& post) {
-  pre = 1;
-  n = 1;
-  post = 1;
+                         int* pre, int* n, int* post) {
+  *pre = 1;
+  *n = 1;
+  *post = 1;
   for (int i = 0; i < axis; ++i) {
-    pre *= x_dims[i];
+    (*pre) *= x_dims[i];
   }
   for (int i = 0; i < y_dims.size(); ++i) {
     PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
                       "Broadcast dimension mismatch.");
-    n *= y_dims[i];
+    (*n) *= y_dims[i];
   }
   for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    post *= x_dims[i];
+    (*post) *= x_dims[i];
   }
 }

-inline void trim_trailing_singular_dims(framework::DDim& dims) {
+inline void trim_trailing_singular_dims(framework::DDim* dims) {
   // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims.size();
+  auto actual_dims_size = dims->size();
   for (; actual_dims_size != 0; --actual_dims_size) {
-    if (dims[actual_dims_size - 1] != 1) break;
+    if ((*dims)[actual_dims_size - 1] != 1) break;
   }
-  if (actual_dims_size != dims.size()) {
-    auto actual_dims = framework::vectorize(dims);
+  if (actual_dims_size != dims->size()) {
+    auto actual_dims = framework::vectorize(*dims);
     actual_dims.resize(actual_dims_size);
-    dims = framework::make_ddim(actual_dims);
+    *dims = framework::make_ddim(actual_dims);
   }
 }
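`get_mid_dims` reduces an elementwise broadcast to three factors: `pre` is the product of x's dimensions before `axis`, `n` is the product of y's dimensions (which must match the corresponding slice of x), and `post` is the product of x's remaining dimensions, so x is viewed as `[pre, n, post]` and y as `[n]`. The refactor only switches the output parameters from references to pointers, so call sites now pass `&pre, &n, &post`, as the later hunks in this file show. A self-contained sketch of the same computation on plain `std::vector<int>` shapes, with one worked example (shapes chosen purely for illustration):

```cpp
#include <cassert>
#include <vector>

// Mirrors get_mid_dims on plain vectors: x is broadcast against y starting at `axis`.
void get_mid_dims(const std::vector<int>& x_dims, const std::vector<int>& y_dims,
                  int axis, int* pre, int* n, int* post) {
  *pre = 1;
  *n = 1;
  *post = 1;
  for (int i = 0; i < axis; ++i) (*pre) *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) {
    assert(x_dims[i + axis] == y_dims[i]);  // broadcast dims must match
    (*n) *= y_dims[i];
  }
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i) (*post) *= x_dims[i];
}

int main() {
  // e.g. x: [2, 3, 4, 5], y: [3, 4], axis = 1
  int pre, n, post;
  get_mid_dims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  assert(pre == 2 && n == 12 && post == 5);  // x is viewed as [pre, n, post]
  return 0;
}
```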
@@ -159,7 +160,7 @@ class RowwiseTransformIterator<T, platform::CUDADeviceContext>
       RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
       super_t;
   HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
-      : super_t(x), begin_(x), n_(n){};
+      : super_t(x), begin_(x), n_(n) {}
   friend class thrust::iterator_core_access;

  private:
@@ -179,7 +180,7 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext>
       MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
       super_t;
   HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
-      : super_t(x), begin_(x), n_(n), post_(post){};
+      : super_t(x), begin_(x), n_(n), post_(post) {}
   friend class thrust::iterator_core_access;

  private:
@@ -333,6 +334,55 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
     }
   }
 #ifdef __NVCC__
+
+// __shfl_down has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+  return __shfl_down(val, delta);
+}
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+template <typename T>
+__device__ T reduceSum(T val, int tid, int len) {
+  // TODO(zcd): The warp size should be taken from the device properties
+  // rather than hard-coded as 32. To make reduceSum efficient, warp-level
+  // parallelism is used under the assumption that the warp size is 32,
+  // which holds for most current GPUs.
+  __shared__ T shm[32];
+  const int warpSize = 32;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, tid < len);
+
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
+    val += __shfl_down_sync(mask, val, offset);
+
+  if (tid < warpSize) shm[tid] = 0;
+  __syncthreads();
+
+  if (tid % warpSize == 0) {
+    shm[tid / warpSize] = val;
+  }
+
+  CREATE_SHFL_MASK(mask, tid < warpSize);
+
+  if (tid < warpSize) {
+    val = shm[tid];
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+      val += __shfl_down_sync(mask, val, offset);
+  }
+
+  return val;
+}
+
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast1CUDAKernel(
     const T* x, const T* y, const T* out, const T* dout, int h, int w,
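The block added above is a warp-level tree reduction: on each step every lane adds the value held `offset` lanes above it, and `offset` halves from 16 down to 1, so after five steps lane 0 of each warp holds that warp's partial sum; the per-warp sums are then staged in `shm` and reduced once more by the first warp. The `CUDA_VERSION` guard and `CREATE_SHFL_MASK` exist because `__shfl_down` was deprecated in CUDA 9.0 in favor of `__shfl_down_sync`. (The same helpers are deleted from `paddle/fluid/platform/cuda_helper.h` further down in this commit.) A host-side sketch of the offset-halving pattern for a single 32-lane warp (plain C++, no CUDA; `warp_reduce_sum` is just an illustrative name):

```cpp
#include <cassert>
#include <numeric>
#include <vector>

// Emulates one warp's shuffle-down reduction: lane i adds the value of lane
// i + offset, with offset halving 16 -> 8 -> 4 -> 2 -> 1; lane 0 ends with the sum.
int warp_reduce_sum(std::vector<int> lanes /* size 32 */) {
  const int kWarpSize = 32;
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    for (int lane = 0; lane + offset < kWarpSize; ++lane) {
      lanes[lane] += lanes[lane + offset];  // plays the role of __shfl_down_sync
    }
  }
  return lanes[0];
}

int main() {
  std::vector<int> lanes(32);
  std::iota(lanes.begin(), lanes.end(), 1);       // lanes hold 1..32
  assert(warp_reduce_sum(lanes) == 32 * 33 / 2);  // 528
  return 0;
}
```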
@@ -355,7 +405,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
   if (dy) {
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = platform::reduceSum(val, tid, h);
+    val = reduceSum(val, tid, h);
     if (threadIdx.x == 0) {
       dy[j] = val;
     }
@@ -432,7 +482,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   if (dy) {
     int h = pre * post;
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = platform::reduceSum(val, tid, h);
+    val = reduceSum(val, tid, h);
     if (threadIdx.x == 0) {
       dy[j] = val;
     }
@@ -472,11 +522,11 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
   auto y_dim = y.dims();
   axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
-  trim_trailing_singular_dims(y_dim);
+  trim_trailing_singular_dims(&y_dim);
   axis = (y_dim.size() == 0) ? x_dim.size() : axis;

   int pre, n, post;
-  get_mid_dims(x_dim, y_dim, axis, pre, n, post);
+  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);

   if (post == 1) {
     int h = pre;
     int w = n;
@@ -514,7 +564,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
       }
     }
   }
-};
+}

 template <typename DeviceContext, typename T, typename functor,
           typename broadcastfunctor, typename broadcast2functor>
@@ -543,11 +593,11 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
   }
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  trim_trailing_singular_dims(y_dims);
+  trim_trailing_singular_dims(&y_dims);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;

   int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);

   if (post == 1) {
     broadcastfunctor f;
@@ -582,11 +632,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
-  trim_trailing_singular_dims(y_dims);
+  trim_trailing_singular_dims(&y_dims);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;

   int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);

   if (post == 1) {
     functor.RunRowWise(n, pre);
     return;
...
@@ -62,53 +62,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 }
 #endif

-// __shfl_down has been deprecated as of CUDA 9.0.
-#if CUDA_VERSION < 9000
-template <typename T>
-__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
-  return __shfl_down(val, delta);
-}
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-#endif
-
-template <typename T>
-__device__ T reduceSum(T val, int tid, int len) {
-  // TODO(zcd): The warp size should be taken from the device properties
-  // rather than hard-coded as 32. To make reduceSum efficient, warp-level
-  // parallelism is used under the assumption that the warp size is 32,
-  // which holds for most current GPUs.
-  __shared__ T shm[32];
-  const int warpSize = 32;
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, tid < len);
-
-  for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    val += __shfl_down_sync(mask, val, offset);
-
-  if (tid < warpSize) shm[tid] = 0;
-  __syncthreads();
-
-  if (tid % warpSize == 0) {
-    shm[tid / warpSize] = val;
-  }
-
-  CREATE_SHFL_MASK(mask, tid < warpSize);
-
-  if (tid < warpSize) {
-    val = shm[tid];
-    for (int offset = warpSize / 2; offset > 0; offset /= 2)
-      val += __shfl_down_sync(mask, val, offset);
-  }
-
-  return val;
-}
-
 }  // namespace platform
 }  // namespace paddle
@@ -278,11 +278,21 @@ class DistributeTranspiler:
                 # we don't need to create them when grad arrives.
                 # change client side var name to origin name by
                 # removing ".trainer_%d" suffix
                 suff_idx = v.name.find(".trainer_")
                 if suff_idx >= 0:
                     orig_var_name = v.name[:suff_idx]
                 else:
                     orig_var_name = v.name
+                # NOTE: single_trainer_var must be created for multi-trainer
+                # case to merge grads from multiple trainers
+                single_trainer_var = \
+                    pserver_program.global_block().create_var(
+                        name=orig_var_name,
+                        persistable=True,
+                        type=v.type,
+                        dtype=v.dtype,
+                        shape=v.shape)
                 if self.trainers > 1:
                     for trainer_id in xrange(self.trainers):
                         var = pserver_program.global_block().create_var(
@@ -293,12 +303,6 @@ class DistributeTranspiler:
                             shape=v.shape)
                         recv_inputs.append(var)
                 else:
-                    single_trainer_var = pserver_program.global_block().create_var(
-                        name=orig_var_name,
-                        persistable=True,
-                        type=v.type,
-                        dtype=v.dtype,
-                        shape=v.shape)
                     recv_inputs.append(single_trainer_var)

         # step3
...
@@ -102,7 +102,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
     package_data['py_paddle']=['*.py','_swig_paddle.so']

 package_dir={
-    '': '${CMAKE_CURRENT_SOURCE_DIR}',
+    '': '${PADDLE_BINARY_DIR}/python',
     # The paddle.fluid.proto will be generated while compiling.
     # So that package points to other directory.
     'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
...