提交 d080d3e6 编写于 作者: Q qiaolongfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into timeline-support-pure-cpu

...@@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT ...@@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::vector<Node *>(outputs.begin(), outputs.end())); std::vector<Node *>(outputs.begin(), outputs.end()));
} }
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
std::vector<Node *> op_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.type() == Node::Type::kValue || node.deleted()) {
continue;
}
op_nodes.push_back(&node);
}
size_t op_num = op_nodes.size();
for (size_t i = 0; i < op_num; i++) {
if (op_nodes[i]->type() == Node::Type::kFunction) continue;
std::unordered_set<std::string> follow_up_input_names;
for (size_t j = i + 1; j < op_num; j++) {
for (auto *in : op_nodes[j]->inlinks) {
follow_up_input_names.insert(in->name());
}
}
std::vector<Node *> filtered_subgraph_outlinks;
for (auto *out : op_nodes[i]->outlinks) {
if (follow_up_input_names.count(out->name())) {
filtered_subgraph_outlinks.push_back(out);
}
}
PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
op_nodes[i]->outlinks = filtered_subgraph_outlinks;
}
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -178,6 +178,7 @@ struct GraphTraits<DataFlowGraph> { ...@@ -178,6 +178,7 @@ struct GraphTraits<DataFlowGraph> {
std::pair<std::vector<Node *>, std::vector<Node *>> std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph); // NOLINT ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph); // NOLINT
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { ...@@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
bool DataFlowGraphToFluidPass::Finalize() { return true; } bool DataFlowGraphToFluidPass::Finalize() { return true; }
void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
FilterRedundantOutputOfSubGraph(graph);
LOG(INFO) << "graph.inputs " << graph->inputs.size(); LOG(INFO) << "graph.inputs " << graph->inputs.size();
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) { for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.deleted()) continue; if (node.deleted()) continue;
......
...@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { ...@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
for (size_t i = 0; i < graph->nodes.size(); i++) { for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i); const Node &node = graph->nodes.Get(i);
if (!config_.display_deleted_node && node.deleted()) continue; if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) { for (auto &out : node.outlinks) {
if (!config_.display_deleted_node && in->deleted()) continue; if (!config_.display_deleted_node && out->deleted()) continue;
dot.AddEdge(in->repr(), node.repr(), {}); dot.AddEdge(node.repr(), out->repr(), {});
} }
} }
return dot.Build(); return dot.Build();
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle { namespace paddle {
...@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) ...@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
if (!other.memory_owned_) {
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
} else {
Resize(other.length());
memcpy(data_, other.data(), other.length());
length_ = other.length();
memory_owned_ = true;
}
return *this;
}
PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
// only the buffer with external memory can be copied // only the buffer with external memory can be copied
assert(!other.memory_owned_);
data_ = other.data_; data_ = other.data_;
length_ = other.length_; length_ = other.length_;
memory_owned_ = other.memory_owned_; memory_owned_ = other.memory_owned_;
other.data_ = nullptr;
other.length_ = 0;
other.memory_owned_ = false;
return *this; return *this;
} }
void PaddleBuf::Resize(size_t length) { void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed. // Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return; if (length_ == length) return;
assert(memory_owned_); if (memory_owned_) {
Free(); Free();
}
data_ = new char[length]; data_ = new char[length];
length_ = length; length_ = length;
memory_owned_ = true; memory_owned_ = true;
...@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) { ...@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() { void PaddleBuf::Free() {
if (memory_owned_ && data_) { if (memory_owned_ && data_) {
assert(length_ > 0); assert(length_ > 0);
delete static_cast<char*>(data_); delete[] static_cast<char*>(data_);
data_ = nullptr; data_ = nullptr;
length_ = 0; length_ = 0;
} }
......
...@@ -40,11 +40,12 @@ class PaddleBuf { ...@@ -40,11 +40,12 @@ class PaddleBuf {
// Copy only available when memory is managed externally. // Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&); explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&); PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
// Do not own the memory. // Do not own the memory.
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Own memory. // Own memory.
explicit PaddleBuf(size_t length) PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Resize to `length` bytes.
void Resize(size_t length); void Resize(size_t length);
......
...@@ -534,8 +534,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -534,8 +534,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
const framework::Tensor& dout, int axis, const framework::Tensor& dout, int axis,
framework::Tensor* dx, framework::Tensor* dy, framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) { DX_OP dx_op, DY_OP dy_op) {
const framework::DDim x_dim = x.dims(); const framework::DDim& x_dim = x.dims();
const framework::DDim y_dim = y.dims(); const framework::DDim& y_dim = y.dims();
if (x.dims() == y.dims()) { if (x.dims() == y.dims()) {
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>( ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
...@@ -558,19 +558,19 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx, ...@@ -558,19 +558,19 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
framework::Tensor* dx, framework::Tensor* dy, framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) { DX_OP dx_op, DY_OP dy_op) {
if (dy == nullptr) { if (dy == nullptr) {
const framework::DDim dx_dims = dout.dims(); const framework::DDim& dx_dims = dout.dims();
auto dy_dims = dx_dims; auto dy_dims = dx_dims;
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>( ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else { } else {
if (dout.dims() == dy->dims()) { if (dout.dims() == dy->dims()) {
const framework::DDim dx_dims = dout.dims(); const framework::DDim& dx_dims = dout.dims();
const framework::DDim dy_dims = dy->dims(); const framework::DDim& dy_dims = dy->dims();
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>( ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else { // Y is a scalar } else { // Y is a scalar
auto dx_dims = dout.dims(); auto dx_dims = dout.dims();
const framework::DDim dy_dims = dy->dims(); const framework::DDim& dy_dims = dy->dims();
ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>( ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include <cub/cub.cuh>
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
namespace paddle { namespace paddle {
...@@ -53,8 +55,196 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad, ...@@ -53,8 +55,196 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]); logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
} }
} }
} // namespace } // namespace
static __device__ __forceinline__ float real_exp(float x) { return expf(x); }
static __device__ __forceinline__ double real_exp(double x) { return exp(x); }
static __device__ __forceinline__ float real_log(float x) {
return math::TolerableValue<float>()(logf(x));
}
static __device__ __forceinline__ double real_log(double x) {
return math::TolerableValue<double>()(log(x));
}
/** In the following codes, 3 CUDA kernels are implemented to calculate softmax
* and loss **/
/*
Supposing the x is `logits` and y is `labels`, the equations are as
followings:
cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
= \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
= \sum_{j}(-y_i_j * tmp_i_j)
softmax_i_j = e^{tmp_i_j}
where:
max_i = \max_{j}{x_i_j}
logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
Therefore, the calculation can be separated into 3 steps:
Step 1: row-wise operation to calculate max_i
Step 2: row-wise operation to calculate logDiffMaxSum_i
Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
To save memory, we can share memory among max_i, logDiffMaxSum_i and
cross\_entropy_i.
In this way, the 3 steps should be changed to:
Step 1 (RowReductionForMax): row-wise operation to calculate max_i
Step 2 (RowReductionForDiffMaxSum): calculate immediate result of softmax'_i_j =
x_i_j - max_i, and row-wise operation to calculate logDiffMaxSum_i
Step 3 (RowReductionForSoftmaxAndCrossEntropy): calculate tmp_i_j = softmax'_i_j
- logDiffMaxSum_i, and finally get softmax_i_j and cross\_entropy_i
*/
// There are 3 kinds of reduce algorithms in cub:
// BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
// BLOCK_REDUCE_RAKING
// BLOCK_REDUCE_WARP_REDUCTIONS (default)
template <typename T, int BlockDim>
using BlockReduce =
cub::BlockReduce<T, BlockDim /*, cub::BLOCK_REDUCE_WARP_REDUCTIONS*/>;
template <typename T, int BlockDim>
using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
// Make sure that BlockDim <= feature_size
// This kernel is used to calculate the max element of each row
template <typename T, int BlockDim>
__global__ void RowReductionForMax(const T* logits_data, T* max_data,
int feature_size) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
T cur_max = logits_data[beg_idx];
beg_idx += BlockDim;
while (beg_idx < end_idx) {
if (cur_max < logits_data[beg_idx]) {
cur_max = logits_data[beg_idx];
}
beg_idx += BlockDim;
}
cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
if (threadIdx.x == 0) {
max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max;
}
}
// Make sure that BlockDim <= feature_size
template <typename T, int BlockDim>
__global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
T* softmax, int feature_size) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
auto block_max = max_data[blockIdx.x];
softmax[beg_idx] = logits_data[beg_idx] - block_max;
T diff_max_sum = real_exp(softmax[beg_idx]);
beg_idx += BlockDim;
while (beg_idx < end_idx) {
softmax[beg_idx] = logits_data[beg_idx] - block_max;
diff_max_sum += real_exp(softmax[beg_idx]);
beg_idx += BlockDim;
}
diff_max_sum =
BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
}
// Make sure that BlockDim <= feature_size
template <typename T, int BlockDim>
__global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
const T* labels_data,
T* loss_data, T* softmax,
int feature_size) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
// log_diff_max_sum shares memory with loss
auto block_log_diff_max_sum = loss_data[blockIdx.x];
auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = real_exp(tmp);
auto loss = -labels_data[beg_idx] * tmp;
beg_idx += BlockDim;
while (beg_idx < end_idx) {
tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = real_exp(tmp);
loss -= (labels_data[beg_idx] * tmp);
beg_idx += BlockDim;
}
loss = BlockReduce<T, BlockDim>(temp_storage).Reduce(loss, cub::Sum());
if (threadIdx.x == 0) loss_data[blockIdx.x] = loss;
}
template <typename T>
__global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int batch_size) {
auto idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < batch_size) out[idx] = static_cast<T>(1);
}
template <typename T>
static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
const T* labels_data,
T* softmax_data, T* loss_data,
int batch_size, int feature_size,
cudaStream_t stream) {
constexpr int kMaxBlockDim = 512;
int block_dim = feature_size >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(feature_size)));
#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: \
RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, loss_data, feature_size); \
RowReductionForDiffMaxSum<T, \
BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, loss_data, softmax_data, feature_size); \
RowReductionForSoftmaxAndCrossEntropy< \
T, BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, labels_data, loss_data, softmax_data, feature_size); \
break
switch (block_dim) {
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
case 1:
SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) /
kMaxBlockDim,
kMaxBlockDim, 0, stream>>>(
softmax_data, batch_size);
cudaMemsetAsync(loss_data, 0, batch_size, stream);
break;
default:
PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
break;
}
#undef CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
}
template <typename T> template <typename T>
class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> { class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
public: public:
...@@ -66,14 +256,24 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> { ...@@ -66,14 +256,24 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
Tensor* softmax = context.Output<Tensor>("Softmax"); Tensor* softmax = context.Output<Tensor>("Softmax");
Tensor* loss = context.Output<Tensor>("Loss"); Tensor* loss = context.Output<Tensor>("Loss");
softmax->mutable_data<T>(context.GetPlace()); auto* softmax_data = softmax->mutable_data<T>(context.GetPlace());
loss->mutable_data<T>(context.GetPlace()); auto* loss_data = loss->mutable_data<T>(context.GetPlace());
math::SoftmaxFunctor<platform::CUDADeviceContext, T>()( auto soft_label = context.Attr<bool>("soft_label");
context.cuda_device_context(), logits, softmax); if (soft_label) {
int batch_size = logits->dims()[0];
int feature_size = logits->dims()[1];
auto* logits_data = logits->data<T>();
auto* labels_data = labels->data<T>();
SoftmaxWithCrossEntropyFusedKernel(
logits_data, labels_data, softmax_data, loss_data, batch_size,
feature_size, context.cuda_device_context().stream());
} else {
math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
softmax);
math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()( math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
context.cuda_device_context(), loss, softmax, labels, context.cuda_device_context(), loss, softmax, labels, false);
context.Attr<bool>("soft_label")); }
} }
}; };
......
...@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert'] ...@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc' DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
UNK_IDX = 0 UNK_IDX = 0
......
...@@ -40,7 +40,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' ...@@ -40,7 +40,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
'wmt_shrinked_data/wmt14.tgz') 'wmt_shrinked_data/wmt14.tgz')
MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
# BLEU of this trained model is 26.92 # BLEU of this trained model is 26.92
URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
START = "<s>" START = "<s>"
......
...@@ -51,17 +51,17 @@ class TranspilerTest(unittest.TestCase): ...@@ -51,17 +51,17 @@ class TranspilerTest(unittest.TestCase):
self.origin_prog = main.clone() self.origin_prog = main.clone()
return main return main
def get_trainer(self, config=None): def get_trainer(self, config=None, sync_mode=True):
t = self._transpiler_instance(config) t = self._transpiler_instance(config, sync_mode)
return t.get_trainer_program() return t.get_trainer_program()
def get_pserver(self, ep, config=None): def get_pserver(self, ep, config=None, sync_mode=True):
t = self._transpiler_instance(config) t = self._transpiler_instance(config, sync_mode)
pserver = t.get_pserver_program(ep) pserver = t.get_pserver_program(ep)
startup = t.get_startup_program(ep, pserver) startup = t.get_startup_program(ep, pserver)
return pserver, startup return pserver, startup
def _transpiler_instance(self, config=None): def _transpiler_instance(self, config=None, sync_mode=True):
if not self.transpiler: if not self.transpiler:
main = self.get_main_program() main = self.get_main_program()
self.transpiler = fluid.DistributeTranspiler(config=config) self.transpiler = fluid.DistributeTranspiler(config=config)
...@@ -69,7 +69,8 @@ class TranspilerTest(unittest.TestCase): ...@@ -69,7 +69,8 @@ class TranspilerTest(unittest.TestCase):
self.trainer_id, self.trainer_id,
program=main, program=main,
pservers=self.pserver_eps, pservers=self.pserver_eps,
trainers=self.trainers) trainers=self.trainers,
sync_mode=sync_mode)
return self.transpiler return self.transpiler
...@@ -464,5 +465,76 @@ class TestDistLookupTable(TestDistLookupTableBase): ...@@ -464,5 +465,76 @@ class TestDistLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestAsyncLocalLookupTable(TestDistLookupTableBase):
def net_conf(self):
self.network_with_table(is_sparse=True, is_distributed=False)
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
self.assertEqual(len(pserver1.blocks), 3)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["adam", "scale", "scale"])
# 2 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["adam", "scale", "scale"])
trainer = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
'fill_constant', 'mean_grad', 'cross_entropy_grad',
'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv',
'recv', 'recv', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestAsyncDistLookupTable(TestDistLookupTableBase):
def net_conf(self):
self.network_with_table(is_sparse=True, is_distributed=True)
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
self.assertEqual(len(pserver1.blocks), 6)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["adam", "scale", "scale"])
# 2 optimize for table sgd
self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"])
# 3 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["lookup_sparse_table"])
# 4 prefetch -> lookup_sparse_table for data1
self.assertEqual([op.type for op in pserver1.blocks[4].ops],
["lookup_sparse_table"])
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_ids', 'send', 'recv', 'recv'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -293,6 +293,7 @@ class DistributeTranspiler(object): ...@@ -293,6 +293,7 @@ class DistributeTranspiler(object):
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
}) })
if self.sync_mode:
program.global_block().append_op( program.global_block().append_op(
type="fetch_barrier", type="fetch_barrier",
inputs={}, inputs={},
......
...@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert'] ...@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc' DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
UNK_IDX = 0 UNK_IDX = 0
......
...@@ -41,7 +41,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' ...@@ -41,7 +41,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
'wmt_shrinked_data/wmt14.tgz') 'wmt_shrinked_data/wmt14.tgz')
MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
# BLEU of this trained model is 26.92 # BLEU of this trained model is 26.92
URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
START = "<s>" START = "<s>"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册