提交 f3badacd 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into fea/jit/gen

...@@ -68,7 +68,6 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d ...@@ -68,7 +68,6 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(WITH_INFERENCE "Compile fluid inference library" ON)
option(ON_INFER "Turn on inference optimization." OFF) option(ON_INFER "Turn on inference optimization." OFF)
option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
......
...@@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY) ...@@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
endif() endif()
add_subdirectory(testing) add_subdirectory(testing)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API) if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
add_subdirectory(fluid) add_subdirectory(fluid)
endif() endif()
...@@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', ' ...@@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
...@@ -177,6 +177,8 @@ paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, k ...@@ -177,6 +177,8 @@ paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, k
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
......
...@@ -9,8 +9,6 @@ add_subdirectory(pybind) ...@@ -9,8 +9,6 @@ add_subdirectory(pybind)
add_subdirectory(recordio) add_subdirectory(recordio)
endif(NOT WIN32) endif(NOT WIN32)
if(WITH_INFERENCE) # NOTE: please add subdirectory inference at last.
# NOTE: please add subdirectory inference at last. add_subdirectory(inference)
add_subdirectory(inference) add_subdirectory(train)
add_subdirectory(train)
endif()
...@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
nccl_ctxs_(ctxs) { nccl_ctxs_(ctxs) {
if (nccl_ctxs_) { if (nccl_ctxs_) {
for (auto &p : places_) { for (auto &p : places_) {
this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p); this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
} }
} }
} }
...@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif #endif
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
if (NoDummyInputSize() == 1) { if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1; return; // No need to all reduce when GPU count = 1;
...@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() { ...@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>(); *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i]; auto &p = places_[i];
auto *var = scope.FindVar(out_var_handles[i]->name_); auto *var = scope.FindVar(out_var_handles[i]->name_);
auto *dev_ctx = dev_ctxes_[p]; auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>(); auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
......
...@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase { ...@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
nccl_ctxs_(nccl_ctxs) { nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) { if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) { for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
p_ctx.second.ctx_.get());
} }
} }
} }
......
...@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() { ...@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) { bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
bool need_wait = bool need_wait =
in_var && in_var->GeneratedOp() && in_var && in_var->GeneratedOp() &&
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_]; in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
return need_wait; return need_wait;
} }
......
...@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle( ...@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) { : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
if (ctxs) { if (ctxs) {
for (auto &p : places_) { for (auto &p : places_) {
this->dev_ctxes_[p] = ctxs->DevCtx(p); this->SetDeviceContext(p, ctxs->DevCtx(p));
} }
} }
} }
...@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() { ...@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1, PADDLE_ENFORCE_GT(places_.size(), 1,
"Data balance can only be enabled when the number of " "Data balance can only be enabled when the number of "
"places to run larger than 1."); "places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
......
...@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() { ...@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one."); "The number of output should be one.");
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
...@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() { ...@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
Tensor *out_tensor = out_value->mutable_value(); Tensor *out_tensor = out_value->mutable_value();
// copy // copy
auto dev_ctx = dev_ctxes_[out_var_handle->place_]; auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx, RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
t_out_p] { t_out_p] {
int s = 0, e = 0; int s = 0, e = 0;
......
...@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() { ...@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
for (auto *in : inputs_) { for (auto *in : inputs_) {
if (NeedWait(in)) { if (NeedWait(in)) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]); in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
} }
} }
} }
......
...@@ -27,7 +27,7 @@ namespace framework { ...@@ -27,7 +27,7 @@ namespace framework {
namespace details { namespace details {
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
if (places_.size() == 1) return; if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
......
...@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase { ...@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
nccl_ctxs_(nccl_ctxs) { nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) { if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) { for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
p_ctx.second.ctx_.get());
} }
} }
} }
......
...@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() { ...@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
continue; continue;
} }
if (in->GeneratedOp()) { if (in->GeneratedOp()) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]); in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
} }
} }
auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
......
...@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, ...@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
coeff_(static_cast<float>(1.0 / num_dev)), coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope), scope_(scope),
place_(place) { place_(place) {
dev_ctxes_[place_] = dev_ctx; this->SetDeviceContext(place_, dev_ctx);
} }
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
...@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() { ...@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() {
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
auto stream = auto stream = static_cast<platform::CUDADeviceContext *>(
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_]) this->dev_ctxes_.at(place_))
->stream(); ->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream); platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(10) << place_ << "RUN Scale loss grad op"; VLOG(10) << place_ << "RUN Scale loss grad op";
......
...@@ -19,81 +19,7 @@ limitations under the License. */ ...@@ -19,81 +19,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
// directly, because there are many vector<LoDTensor> used accross the project,
// and some of them are treated as LoDTensorArray.
#if !defined(PADDLE_ON_INFERENCE)
using LoDTensorArray = std::vector<LoDTensor>; using LoDTensorArray = std::vector<LoDTensor>;
#else // !PADDLE_ON_INFERENCE
#pragma message "LoDTensorArray is replaced with the inference one."
/*
* A LoDTensorArray which will not deallocate buffer when resized, fix the data
* diff in inference, and more performance friendly in the concurrency
* scenerios.
*/
class LoDTensorArray {
public:
LoDTensorArray() = default;
using iterator = std::vector<LoDTensor>::iterator;
using const_iterator = std::vector<LoDTensor>::const_iterator;
const_iterator begin() const { return array_.begin(); }
const_iterator end() const { return array_.begin() + size_; }
iterator begin() { return array_.begin(); }
iterator end() { return array_.begin() + size_; }
void push_back(const LoDTensor& x) {
if (size_ < array_.size()) {
array_[size_++] = x;
} else {
array_.push_back(x);
++size_;
}
}
void resize(size_t size) {
if (array_.size() < size) {
array_.resize(size);
}
size_ = size;
}
void emplace_back() { array_.emplace_back(); }
void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); }
LoDTensor& back() { return array_.back(); }
size_t space() const { return array_.size(); }
void reserve(size_t size) {
// Naive warning to tell user this array might be to large. The memory and
// buffer used by this TensorArray will not be deleted during the training
// and inference phase, so attention not to make it expand too long.
if (size > 800UL) {
LOG(WARNING) << "TensorArray has more than 800 items";
}
array_.reserve(size);
}
bool empty() const { return size_ == 0UL; }
void clear() { size_ = 0UL; }
LoDTensor& operator[](size_t id) { return array_[id]; }
const LoDTensor& operator[](size_t id) const { return array_[id]; }
LoDTensor& at(size_t id) { return array_.at(id); }
const LoDTensor& at(size_t id) const { return array_.at(id); }
size_t size() const { return size_; }
private:
size_t size_{0};
std::vector<LoDTensor> array_;
};
#endif // !PADDLE_ON_INFERENCE
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -358,7 +358,7 @@ static bool VarIsTensor(const Variable* var) { ...@@ -358,7 +358,7 @@ static bool VarIsTensor(const Variable* var) {
return var->IsType<LoDTensor>() || var->IsType<SelectedRows>(); return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
} }
static const Tensor* GetTensorFromVar(Variable* var) { const Tensor* GetTensorFromVar(Variable* var) {
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>(); return var->GetMutable<LoDTensor>();
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
......
...@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) { ...@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
} }
proto::VarType::Type GetDataTypeOfVar(const Variable* var); proto::VarType::Type GetDataTypeOfVar(const Variable* var);
const Tensor* GetTensorFromVar(Variable* var);
class OperatorBase; class OperatorBase;
class ExecutionContext; class ExecutionContext;
......
...@@ -303,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ...@@ -303,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
} }
ParallelExecutor::~ParallelExecutor() { ParallelExecutor::~ParallelExecutor() {
const auto dev_ctxs = for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().GetAllDeviceContexts(); platform::DeviceContextPool::Instance().Get(p)->Wait();
for (auto &dev_ctx : dev_ctxs) {
dev_ctx->Wait();
} }
if (member_->own_local_scope_) { if (member_->own_local_scope_) {
......
...@@ -61,8 +61,6 @@ cc_test(test_paddle_inference_api ...@@ -61,8 +61,6 @@ cc_test(test_paddle_inference_api
inference_api_test(test_api_impl SRC api_impl_tester.cc inference_api_test(test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification) ARGS test_word2vec test_image_classification)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book) ARGS --dirname=${PYTHON_TESTS_DIR}/book)
......
...@@ -22,9 +22,9 @@ limitations under the License. */ ...@@ -22,9 +22,9 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#ifdef __clang__ #ifdef __clang__
#define ACC_DIFF 4e-3 #define ACC_DIFF 4e-2
#else #else
#define ACC_DIFF 1e-3 #define ACC_DIFF 1e-2
#endif #endif
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -187,7 +187,7 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -187,7 +187,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int tid = 0; tid < num_jobs; ++tid) { for (int tid = 0; tid < num_jobs; ++tid) {
threads.emplace_back([&, tid]() { threads.emplace_back([&, tid]() {
auto predictor = main_predictor->Clone(); auto predictor = CreatePaddlePredictor(config);
auto& local_inputs = paddle_tensor_feeds[tid]; auto& local_inputs = paddle_tensor_feeds[tid];
std::vector<PaddleTensor> local_outputs; std::vector<PaddleTensor> local_outputs;
ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
...@@ -245,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -245,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int tid = 0; tid < num_jobs; ++tid) { for (int tid = 0; tid < num_jobs; ++tid) {
threads.emplace_back([&, tid]() { threads.emplace_back([&, tid]() {
auto predictor = main_predictor->Clone(); auto predictor = CreatePaddlePredictor(config);
auto& local_inputs = paddle_tensor_feeds[tid]; auto& local_inputs = paddle_tensor_feeds[tid];
std::vector<PaddleTensor> local_outputs; std::vector<PaddleTensor> local_outputs;
ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
......
...@@ -70,8 +70,12 @@ void Main(bool use_gpu) { ...@@ -70,8 +70,12 @@ void Main(bool use_gpu) {
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements); for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
i++) { i++) {
CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i], // Here will result random fail, for that the model is trained by CI, the
0.001); // train phase is not stable, so the result will be random.
// TODO(Superjomn) will restore after the model is upload.
// CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
// result[i],
// 0.001);
} }
} }
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/add_position_encoding_op.h"
namespace paddle {
namespace operators {
class AddPositionEncodingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"X(Input) of add_position_encoding_op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Out(Output) of add_position_encoding_op should not be null.");
auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Out@GRAD must not be null.");
auto out_dims = ctx->GetInputDim("Out");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
}
}
};
class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of AddPositionEncoding operator");
AddOutput("Out", "Output of AddPositionEncoding operator");
AddAttr<float>("alpha", "The scale of Original Embedding.")
.SetDefault(1.0f)
.AddCustomChecker([](const float& alpha) {
PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
});
AddAttr<float>("beta", "The scale of Position Embedding.")
.SetDefault(1.0f)
.AddCustomChecker([](const float& beta) {
PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
});
AddComment(R"DOC(
Add Position Encoding Operator.
The add position encoding calculates the output based on the input, alpha, beta.
The size of each dimension of the parameters checked in the infer-shape.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plt = paddle::platform;
REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
ops::AddPositionEncodingOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
REGISTER_OP_CPU_KERNEL(
add_position_encoding,
ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
add_position_encoding_grad,
ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AddPositionEncodingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::LoDTensor>("X");
auto& x_lod = X->lod();
auto* src_ptr = X->data<T>();
auto* Out = context.Output<framework::LoDTensor>("Out");
auto* dst_ptr = Out->mutable_data<T>(context.GetPlace());
float alpha = context.Attr<float>("alpha");
float beta = context.Attr<float>("beta");
auto x_dim = X->dims();
int batch_size = 0;
int max_seq_len = 0;
int enc_size = 0;
if (x_lod.empty()) {
PADDLE_ENFORCE(
x_dim.size() == 3UL,
"The input X of Add Position Encoding should be 3-D Tensor!");
batch_size = x_dim[0];
max_seq_len = x_dim[1];
enc_size = x_dim[2];
} else {
PADDLE_ENFORCE(
x_dim.size() == 2UL,
"The input X of Add Position Encoding should be 2-D LoDTensor!");
PADDLE_ENFORCE(
x_lod.size() == 1UL,
"The Add Position Encoding Op only supports lod_level == 1!");
batch_size = x_lod[0].size() - 1;
max_seq_len = -1;
enc_size = x_dim[1];
}
PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!");
const int half_size = enc_size / 2;
for (int i = 0; i < batch_size; ++i) {
const int max_length =
x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
for (int j = 0; j < max_length; ++j) {
for (int k = 0; k < half_size; ++k) {
const double val = (half_size > 1)
? j / pow(10000.0, double(k) / (half_size - 1))
: j / 10000.0;
dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
dst_ptr[half_size + k] =
src_ptr[half_size + k] * alpha + cos(val) * beta;
}
src_ptr += enc_size;
dst_ptr += enc_size;
}
}
}
};
template <typename DeviceContext, typename T>
class AddPositionEncodingGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* dOut =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto dout = framework::EigenVector<T>::Flatten(*dOut);
auto* dX =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
dX->mutable_data<T>(context.GetPlace());
auto dx = framework::EigenVector<T>::Flatten(*dX);
float alpha = context.Attr<float>("alpha");
auto* place =
context.template device_context<DeviceContext>().eigen_device();
dx.device(*place) = dout * static_cast<T>(alpha);
}
};
} // namespace operators
} // namespace paddle
...@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ...@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>, REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
ops::GatherOpKernel<int>, ops::GatherOpKernel<double>); ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
ops::GatherOpKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>, REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
ops::GatherGradientOpKernel<double>,
ops::GatherGradientOpKernel<int>, ops::GatherGradientOpKernel<int>,
ops::GatherGradientOpKernel<double>); ops::GatherGradientOpKernel<int64_t>);
...@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>); REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>); ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
ops::GatherGradOpCUDAKernel<double>,
ops::GatherGradOpCUDAKernel<int64_t>,
ops::GatherGradOpCUDAKernel<int>);
...@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor, ...@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T> template <typename T, bool is_test>
class MaxSeqPoolFunctor { class MaxSeqPoolFunctor {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
...@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor { ...@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
} }
} }
}; };
// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
// index buffer
template <typename T>
class MaxSeqPoolFunctor<T, true> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1);
PADDLE_ENFORCE_GT(out_dims.size(), 1);
for (int64_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
}
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->data<T>();
int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq;
for (int64_t i = 0; i < num_seq; ++i) {
std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
dim * sizeof(T));
for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
for (int64_t k = 0; k < dim; ++k) {
if (in_data[j * dim + k] > out_data[i * dim + k]) {
out_data[i * dim + k] = in_data[j * dim + k];
}
}
}
}
}
};
template <typename T> template <typename T>
class MaxSeqPoolGradFunctor { class MaxSeqPoolGradFunctor {
public: public:
...@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
/* max pool has index output */ /* max pool has index output */
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const std::string pooltype, const framework::LoDTensor& input, const std::string pooltype, const framework::LoDTensor& input,
framework::Tensor* output, framework::Tensor* output, bool is_test,
framework::Tensor* index = nullptr) { framework::Tensor* index = nullptr) {
if (pooltype == "MAX") { if (pooltype == "MAX") {
math::MaxSeqPoolFunctor<T> max_pool; if (is_test) {
max_pool(context, input, output, index); math::MaxSeqPoolFunctor<T, true> max_pool;
max_pool(context, input, output, index);
} else {
math::MaxSeqPoolFunctor<T, false> max_pool;
max_pool(context, input, output, index);
}
return; return;
} }
if (pooltype == "LAST") { if (pooltype == "LAST") {
...@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
last_pool(context, input, output); last_pool(context, input, output);
return; return;
} }
if (pooltype == "FIRST") { if (pooltype == "FIRST") {
math::FirstSeqPoolFunctor<T> first_pool; math::FirstSeqPoolFunctor<T> first_pool;
first_pool(context, input, output); first_pool(context, input, output);
......
...@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> { ...@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const std::string pooltype, const framework::LoDTensor& input, const std::string pooltype, const framework::LoDTensor& input,
framework::Tensor* output, framework::Tensor* output, bool is_test,
framework::Tensor* index = nullptr) { framework::Tensor* index = nullptr) {
auto& lod = input.lod()[0]; auto& lod = input.lod()[0];
const size_t item_dim = output->numel() / output->dims()[0]; const size_t item_dim = output->numel() / output->dims()[0];
......
...@@ -28,7 +28,7 @@ class SequencePoolFunctor { ...@@ -28,7 +28,7 @@ class SequencePoolFunctor {
/* max pool has index output */ /* max pool has index output */
void operator()(const DeviceContext& context, const std::string pooltype, void operator()(const DeviceContext& context, const std::string pooltype,
const framework::LoDTensor& input, framework::Tensor* output, const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index = nullptr); bool is_test = false, framework::Tensor* index = nullptr);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
......
...@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor<int>) This tensor is used for the sequence max-pooling " "(Tensor<int>) This tensor is used for the sequence max-pooling "
"to record the max indexes.") "to record the max indexes.")
.AsIntermediate(); .AsIntermediate();
AddAttr<bool>("is_test", "").SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"pooltype", "pooltype",
"(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
......
...@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> { ...@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
auto* in = context.Input<LoDTensor>("X"); auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<Tensor>("Out"); auto* out = context.Output<Tensor>("Out");
std::string pooltype = context.Attr<std::string>("pooltype"); std::string pooltype = context.Attr<std::string>("pooltype");
Tensor* index = nullptr;
if (pooltype == "MAX") {
index = context.Output<Tensor>("MaxIndex");
}
auto dims = in->dims(); auto dims = in->dims();
auto lod = in->lod(); auto lod = in->lod();
...@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> { ...@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
dims[0] = lod[0].size() - 1; dims[0] = lod[0].size() - 1;
out->Resize({dims}); out->Resize({dims});
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
if (pooltype == "MAX") { Tensor* index = nullptr;
const bool is_test = context.Attr<bool>("is_test");
// Do not create index buffer for inference (is_test) mode
// TODO(jczaja): Skip index buffer creation for other devices eg. GPU
if (pooltype == "MAX" &&
(is_test == false ||
platform::is_cpu_place(context.GetPlace()) == false)) {
index = context.Output<Tensor>("MaxIndex");
index->Resize({dims}); index->Resize({dims});
index->mutable_data<int>(context.GetPlace()); index->mutable_data<int>(context.GetPlace());
} }
math::SequencePoolFunctor<DeviceContext, T> pool; math::SequencePoolFunctor<DeviceContext, T> pool;
pool(context.template device_context<DeviceContext>(), pooltype, *in, out, pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
index); is_test, index);
} }
}; };
......
...@@ -82,14 +82,16 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -82,14 +82,16 @@ class SumOp : public framework::OperatorWithKernel {
if (x_vars[0]->IsType<framework::LoDTensor>()) { if (x_vars[0]->IsType<framework::LoDTensor>()) {
int dtype = -1; int dtype = -1;
for (auto& x_var : x_vars) { for (auto& x_var : x_vars) {
auto& lod_tensor = x_var->Get<framework::LoDTensor>(); // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
if (lod_tensor.numel() == 0) { auto tensor = framework::GetTensorFromVar(
const_cast<framework::Variable*>(x_var));
if (tensor->numel() == 0) {
continue; continue;
} }
if (dtype == -1) { if (dtype == -1) {
dtype = framework::ToDataType(lod_tensor.type()); dtype = framework::ToDataType(tensor->type());
} else { } else {
PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type())); PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type()));
} }
} }
PADDLE_ENFORCE_NE(dtype, -1, PADDLE_ENFORCE_NE(dtype, -1,
......
...@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { ...@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
"'Place' is not supported, Please re-compile with WITH_GPU " "'Place' is not supported, Please re-compile with WITH_GPU "
"option"); "option");
} }
return it->second.get(); return it->second.get().get();
} }
const std::vector<const DeviceContext*> template <typename DevCtx, typename PlaceType>
DeviceContextPool::GetAllDeviceContexts() const { inline void EmplaceDeviceContext(
std::vector<const DeviceContext*> all_device_ctx; std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
all_device_ctx.reserve(device_contexts_.size()); map_ptr,
for (auto& dev_ctx : device_contexts_) { platform::Place p) {
all_device_ctx.emplace_back(dev_ctx.second.get()); using PtrType = std::unique_ptr<DeviceContext>;
} map_ptr->emplace(p, std::async(std::launch::deferred, [=] {
return all_device_ctx; // lazy evaluation. i.e., only create device context at
// first `Get`
return PtrType(new DevCtx(boost::get<PlaceType>(p)));
}));
} }
DeviceContextPool::DeviceContextPool( DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) { const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
using PtrType = std::unique_ptr<DeviceContext>;
std::set<Place> set; std::set<Place> set;
for (auto& p : places) { for (auto& p : places) {
set.insert(p); set.insert(p);
...@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool( ...@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool(
for (auto& p : set) { for (auto& p : set) {
if (platform::is_cpu_place(p)) { if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace( EmplaceDeviceContext<MKLDNNDeviceContext, CPUPlace>(&device_contexts_, p);
p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
#else #else
device_contexts_.emplace( EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
#endif #endif
} else if (platform::is_gpu_place(p)) { } else if (platform::is_gpu_place(p)) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
device_contexts_.emplace( EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
#else #else
PADDLE_THROW( PADDLE_THROW(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU " "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
...@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool( ...@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool(
#endif #endif
} else if (platform::is_cuda_pinned_place(p)) { } else if (platform::is_cuda_pinned_place(p)) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
device_contexts_.emplace( EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
p, &device_contexts_, p);
PtrType(new CUDAPinnedDeviceContext(boost::get<CUDAPinnedPlace>(p))));
#else #else
PADDLE_THROW( PADDLE_THROW(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU " "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <future> // NOLINT
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
...@@ -223,9 +224,6 @@ class DeviceContextPool { ...@@ -223,9 +224,6 @@ class DeviceContextPool {
/*! \brief Return handle of single device context. */ /*! \brief Return handle of single device context. */
platform::DeviceContext* Get(const platform::Place& place); platform::DeviceContext* Get(const platform::Place& place);
/*! \brief Return all the device contexts. */
const std::vector<const DeviceContext*> GetAllDeviceContexts() const;
template <typename Place> template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace( const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) { const Place& place) {
...@@ -237,7 +235,8 @@ class DeviceContextPool { ...@@ -237,7 +235,8 @@ class DeviceContextPool {
private: private:
static DeviceContextPool* pool; static DeviceContextPool* pool;
std::map<Place, std::unique_ptr<DeviceContext>> device_contexts_; std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
device_contexts_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool); DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
}; };
......
...@@ -153,7 +153,6 @@ function cmake_gen() { ...@@ -153,7 +153,6 @@ function cmake_gen() {
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE=${WITH_INFERENCE:-ON}
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
...@@ -186,7 +185,6 @@ EOF ...@@ -186,7 +185,6 @@ EOF
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
...@@ -653,7 +651,7 @@ function gen_capi_package() { ...@@ -653,7 +651,7 @@ function gen_capi_package() {
function gen_fluid_lib() { function gen_fluid_lib() {
mkdir -p ${PADDLE_ROOT}/build mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Generating fluid library for train and inference ... Generating fluid library for train and inference ...
...@@ -666,7 +664,7 @@ EOF ...@@ -666,7 +664,7 @@ EOF
} }
function tar_fluid_lib() { function tar_fluid_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Taring fluid library for train and inference ... Taring fluid library for train and inference ...
...@@ -681,7 +679,7 @@ EOF ...@@ -681,7 +679,7 @@ EOF
} }
function test_fluid_lib() { function test_fluid_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Testing fluid library for inference ... Testing fluid library for inference ...
......
...@@ -157,6 +157,8 @@ __all__ = [ ...@@ -157,6 +157,8 @@ __all__ = [
'sequence_reverse', 'sequence_reverse',
'affine_channel', 'affine_channel',
'hash', 'hash',
'log_loss',
'add_position_encoding',
] ]
...@@ -747,7 +749,7 @@ def dynamic_gru(input, ...@@ -747,7 +749,7 @@ def dynamic_gru(input,
attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
batch_size = input.shape[0] batch_size = input.shape[0]
inputs = {'Input': input, 'Weight': weight, 'Bias': bias} inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
if h_0 != None: if h_0:
assert h_0.shape == ( assert h_0.shape == (
batch_size, size batch_size, size
), 'The shape of h0 should be(batch_size, %d)' % size ), 'The shape of h0 should be(batch_size, %d)' % size
...@@ -1823,7 +1825,7 @@ def conv3d(input, ...@@ -1823,7 +1825,7 @@ def conv3d(input,
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
def sequence_pool(input, pool_type): def sequence_pool(input, pool_type, is_test=False):
""" """
This function add the operator for sequence pooling. This function add the operator for sequence pooling.
It pools features of all time-steps of each instance, and is applied It pools features of all time-steps of each instance, and is applied
...@@ -1860,6 +1862,7 @@ def sequence_pool(input, pool_type): ...@@ -1860,6 +1862,7 @@ def sequence_pool(input, pool_type):
input(variable): The input variable which is a LoDTensor. input(variable): The input variable which is a LoDTensor.
pool_type (string): The pooling type of sequence_pool. pool_type (string): The pooling type of sequence_pool.
It supports average, sum, sqrt and max. It supports average, sum, sqrt and max.
is_test(bool, Default False): Used distinguish training from scoring mode.
Returns: Returns:
The sequence pooling variable which is a Tensor. The sequence pooling variable which is a Tensor.
...@@ -1887,7 +1890,8 @@ def sequence_pool(input, pool_type): ...@@ -1887,7 +1890,8 @@ def sequence_pool(input, pool_type):
inputs={"X": input}, inputs={"X": input},
outputs={"Out": pool_out, outputs={"Out": pool_out,
"MaxIndex": max_index}, "MaxIndex": max_index},
attrs={"pooltype": pool_type.upper()}) attrs={"pooltype": pool_type.upper(),
"is_test": is_test})
# when pool_type is max, variable max_index is initialized, # when pool_type is max, variable max_index is initialized,
# so we stop the gradient explicitly here # so we stop the gradient explicitly here
...@@ -7580,3 +7584,99 @@ def hash(input, hash_size, num_hash=1, name=None): ...@@ -7580,3 +7584,99 @@ def hash(input, hash_size, num_hash=1, name=None):
attrs={'num_hash': num_hash, attrs={'num_hash': num_hash,
'mod_by': hash_size}) 'mod_by': hash_size})
return out return out
def log_loss(input, label, epsilon=1e-4, name=None):
"""
**Negative Log Loss Layer**
This layer accepts input predictions and target label and returns the
negative log loss.
.. math::
Out = -label * \\log{(input + \\epsilon)}
- (1 - label) * \\log{(1 - input + \\epsilon)}
Args:
input (Variable|list): a 2-D tensor with shape [N x 1], where N is the
batch size. This input is a probability computed
by the previous operator.
label (Variable|list): the ground truth which is a 2-D tensor with
shape [N x 1], where N is the batch size.
epsilon (float): epsilon
name (string): the name of log_loss
Returns:
Variable: A 2-D tensor with shape [N x 1], the negative log loss.
Examples:
.. code-block:: python
prob = fluid.layers.sigmoid(net)
cost = fluid.layers.log_loss(input=prob, label=label)
"""
helper = LayerHelper('log_loss', **locals())
if name is None:
loss = helper.create_variable_for_type_inference(dtype=input.dtype)
else:
loss = helper.create_variable(
name=name, dtype=input.dtype, persistable=False)
helper.append_op(
type='log_loss',
inputs={'Predicted': [input],
'Labels': [label]},
outputs={'Loss': [loss]},
attrs={'epsilon': epsilon})
return loss
def add_position_encoding(input, alpha, beta, name=None):
"""
**Add Position Encoding Layer**
This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
output Tensor of shape [N x M x P] with positional encoding value.
Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
.. math::
PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})} \\\\
PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})} \\\\
Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
Where:
* PE(pos, 2i): the increment for the number at even position
* PE(pos, 2i + 1): the increment for the number at odd position
Args:
input (Variable): 3-D input tensor with shape [N x M x P]
alpha (float): multiple of Input Tensor
beta (float): multiple of Positional Encoding Tensor
name (string): the name of position encoding layer
Returns:
Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
Examples:
.. code-block:: python
position_tensor = fluid.layers.add_position_encoding(input=tensor)
"""
helper = LayerHelper('add_position_encoding', **locals())
dtype = helper.input_dtype()
if name is None:
out = helper.create_variable_for_type_inference(dtype=dtype)
else:
out = helper.create_variable(name=name, dtype=dtype, persistable=False)
helper.append_op(
type="add_position_encoding",
inputs={"X": input},
outputs={"Out": out},
attrs={"alpha": alpha,
"beta": beta})
return out
...@@ -194,7 +194,7 @@ class CompositeMetric(MetricBase): ...@@ -194,7 +194,7 @@ class CompositeMetric(MetricBase):
or soft-label, should custom the corresponding update rule. or soft-label, should custom the corresponding update rule.
""" """
for m in self._metrics: for m in self._metrics:
ans.append(m.update(preds, labels)) m.update(preds, labels)
def eval(self): def eval(self):
""" """
......
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test if(NOT APPLE)
foreach(src ${TEST_OPS}) # default test
py_test(${src} SRCS ${src}.py) foreach(src ${TEST_OPS})
endforeach() py_test(${src} SRCS ${src}.py)
endforeach()
else()
foreach(src ${TEST_OPS})
if(${src} STREQUAL "test_image_classification_vgg")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif(${src} STREQUAL "test_image_classification_resnet")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
endif()
...@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE) ...@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
endif(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
...@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME) ...@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME)
if (py_test_modules_SERIAL) if (py_test_modules_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif() endif()
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
endif() endif()
endfunction() endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_warpctc_op)
...@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE ...@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
endif()
...@@ -90,8 +90,10 @@ class TestDistMnist2x2(TestDistRunnerBase): ...@@ -90,8 +90,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
inference_program = fluid.default_main_program().clone() inference_program = fluid.default_main_program().clone()
# Optimization # Optimization
opt = fluid.optimizer.AdamOptimizer( # TODO(typhoonzero): fix distributed adam optimizer
learning_rate=0.001, beta1=0.9, beta2=0.999) # opt = fluid.optimizer.AdamOptimizer(
# learning_rate=0.001, beta1=0.9, beta2=0.999)
opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
# Reader # Reader
train_reader = paddle.batch( train_reader = paddle.batch(
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import math
import paddle.fluid.core as core
from op_test import OpTest
class TestAddPositionEncodingTensorOp(OpTest):
"""
This class is to test the AddPositionEncodingOp
"""
def setUp(self):
"""
the prepared section for add position encoding op
"""
self.op_type = "add_position_encoding"
self.dtype = np.float32
self.init_input_output()
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x), }
self.outputs = {'Out': self.out}
self.attrs = {'alpha': self.alpha, 'beta': self.beta}
def test_check_output(self):
"""
check the correctness of output
"""
self.check_output()
def test_check_grad(self):
"""
check the correctness of grad
"""
self.check_grad(['X'], 'Out', max_relative_error=0.005)
def init_input_output(self):
"""
init the input and output for test cases
"""
self.alpha = 0.6
self.beta = 0.5
self.x = np.random.uniform(0.1, 1, [2, 4, 4]).astype(self.dtype)
self.out = np.copy(self.x)
batch_size = self.x.shape[0]
max_length = self.x.shape[1]
enc_size = self.x.shape[2]
half_shape = int(enc_size / 2)
for i in range(batch_size):
for j in range(max_length):
for k in range(half_shape):
val = j / pow(10000.0, k / (
half_shape - 1)) if half_shape > 1 else j / 10000.0
self.out[i, j, k] = \
self.x[i, j, k] * self.alpha + math.sin(val) * self.beta
self.out[i, j, half_shape + k] = \
self.x[i, j, half_shape + k] * self.alpha + math.cos(val) * self.beta
class TestAddPositionEncodingLoDTensorOp(OpTest):
"""
This class is to test the AddPositionEncodingLoDTensorOp
"""
def setUp(self):
"""
the prepared section for add position encoding LoDTensor op
"""
self.op_type = "add_position_encoding"
self.dtype = np.float32
self.init_input_output()
self.inputs = {'X': (self.x, self.lod), }
self.outputs = {'Out': (self.out, self.lod)}
self.attrs = {'alpha': self.alpha, 'beta': self.beta}
def test_check_output(self):
"""
check the correctness of output
"""
self.check_output()
def test_check_grad(self):
"""
check the correctness of grad
"""
self.check_grad(['X'], 'Out', max_relative_error=0.005)
def init_input_output(self):
"""
init the input and output for test cases
"""
self.alpha = 0.6
self.beta = 0.5
self.x = np.random.uniform(0.1, 1, [10, 4]).astype(self.dtype)
self.lod = [[3, 7]]
self.out = np.copy(self.x)
batch_size = len(self.lod[0])
enc_size = self.x.shape[1]
start = 0
half_shape = int(enc_size / 2)
for i in range(batch_size):
max_length = self.lod[0][i]
for j in range(max_length):
for k in range(half_shape):
val = j / pow(10000.0, k / (
half_shape - 1)) if half_shape > 1 else j / 10000.0
pos = start + j
self.out[pos, k] = \
self.x[pos, k] * self.alpha + math.sin(val) * self.beta
self.out[pos, half_shape + k] = \
self.x[pos, half_shape + k] * self.alpha + math.cos(val) * self.beta
start += max_length
if __name__ == '__main__':
unittest.main()
...@@ -22,6 +22,8 @@ import signal ...@@ -22,6 +22,8 @@ import signal
import subprocess import subprocess
import six import six
import argparse import argparse
import pickle
import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -128,10 +130,15 @@ class TestDistRunnerBase(object): ...@@ -128,10 +130,15 @@ class TestDistRunnerBase(object):
else: else:
return origin_batch return origin_batch
out_losses = []
for _ in six.moves.xrange(RUN_STEP): for _ in six.moves.xrange(RUN_STEP):
loss, = exe.run(fetch_list=[avg_cost.name], loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(get_data())) feed=feeder.feed(get_data()))
print(loss) out_losses.append(loss[0])
if six.PY2:
print(pickle.dumps(out_losses))
else:
sys.stdout.buffer.write(pickle.dumps(out_losses))
def runtime_main(test_class): def runtime_main(test_class):
...@@ -149,7 +156,7 @@ def runtime_main(test_class): ...@@ -149,7 +156,7 @@ def runtime_main(test_class):
parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--use_reduce', action='store_true')
parser.add_argument( parser.add_argument(
'--use_reader_alloc', action='store_true', required=False, default=True) '--use_reader_alloc', action='store_true', required=False)
parser.add_argument('--batch_size', required=False, type=int, default=2) parser.add_argument('--batch_size', required=False, type=int, default=2)
parser.add_argument( parser.add_argument(
'--batch_merge_repeat', required=False, type=int, default=1) '--batch_merge_repeat', required=False, type=int, default=1)
...@@ -188,7 +195,7 @@ class TestDistBase(unittest.TestCase): ...@@ -188,7 +195,7 @@ class TestDistBase(unittest.TestCase):
self._pservers = 2 self._pservers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port()) self._find_free_port(), self._find_free_port())
self._python_interp = "python" self._python_interp = sys.executable
self._sync_mode = True self._sync_mode = True
self._enforce_place = None self._enforce_place = None
self._mem_opt = False self._mem_opt = False
...@@ -237,21 +244,6 @@ class TestDistBase(unittest.TestCase): ...@@ -237,21 +244,6 @@ class TestDistBase(unittest.TestCase):
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
def _wait_ps_ready(self, pid):
retry_times = 50
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(3)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error as e:
sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
(e, retry_times))
retry_times -= 1
def _run_local(self, def _run_local(self,
model, model,
envs, envs,
...@@ -288,23 +280,20 @@ class TestDistBase(unittest.TestCase): ...@@ -288,23 +280,20 @@ class TestDistBase(unittest.TestCase):
env=envs) env=envs)
local_out, local_err = local_proc.communicate() local_out, local_err = local_proc.communicate()
local_ret = cpt.to_text(local_out)
if check_error_log: if check_error_log:
err_log.close() err_log.close()
sys.stderr.write('local_stdout: %s\n' % local_ret) sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
sys.stderr.write('local_stderr: %s\n' % local_err) sys.stderr.write('local_stderr: %s\n' % local_err)
local_losses = local_ret.split("\n") return pickle.loads(local_out)
return local_losses
def _run_cluster(self, model, envs, check_error_log): def _run_cluster(self, model, envs, check_error_log):
# Run dist train to compare with local results # Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
check_error_log, envs) check_error_log, envs)
self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
...@@ -339,8 +328,8 @@ class TestDistBase(unittest.TestCase): ...@@ -339,8 +328,8 @@ class TestDistBase(unittest.TestCase):
env0.update(envs) env0.update(envs)
env1.update(envs) env1.update(envs)
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0)) print("tr0_cmd:{}".format(tr0_cmd))
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1)) print("tr1_cmd:{}".format(tr1_cmd))
tr0_pipe = open("/tmp/tr0_err.log", "wb") tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb")
...@@ -356,9 +345,7 @@ class TestDistBase(unittest.TestCase): ...@@ -356,9 +345,7 @@ class TestDistBase(unittest.TestCase):
env=env1) env=env1)
tr0_out, tr0_err = tr0_proc.communicate() tr0_out, tr0_err = tr0_proc.communicate()
tr0_loss_text = cpt.to_text(tr0_out)
tr1_out, tr1_err = tr1_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate()
tr1_loss_text = cpt.to_text(tr1_out)
# close trainer file # close trainer file
tr0_pipe.close() tr0_pipe.close()
...@@ -373,15 +360,13 @@ class TestDistBase(unittest.TestCase): ...@@ -373,15 +360,13 @@ class TestDistBase(unittest.TestCase):
ps1.terminate() ps1.terminate()
# print log # print log
sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err) sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text) sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
tr0_losses = tr0_loss_text.split("\n") # return tr0_losses, tr1_losses
tr1_losses = tr1_loss_text.split("\n") return pickle.loads(tr0_out), pickle.loads(tr1_out)
return tr0_losses, tr1_losses
def check_with_place(self, def check_with_place(self,
model_file, model_file,
...@@ -411,9 +396,9 @@ class TestDistBase(unittest.TestCase): ...@@ -411,9 +396,9 @@ class TestDistBase(unittest.TestCase):
check_error_log) check_error_log)
for step_id in range(RUN_STEP): for step_id in range(RUN_STEP):
local_loss = eval(local_losses[step_id])[0] local_loss = local_losses[step_id]
tr0_loss = eval(tr0_losses[step_id])[0] tr0_loss = tr0_losses[step_id]
tr1_loss = eval(tr1_losses[step_id])[0] tr1_loss = tr1_losses[step_id]
dist_loss = (tr0_loss + tr1_loss) / 2 dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
print(str(local_loss) + ":" + str(dist_loss)) print("=======", local_loss, ":", dist_loss[0], "=======")
self.assertAlmostEqual(local_loss, dist_loss, delta=delta) self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
...@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase): ...@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase):
self._use_reader_alloc = False self._use_reader_alloc = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistseResnXt2x2WithMemopt(TestDistBase): class TestDistseResnXt2x2WithMemopt(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._mem_opt = True self._mem_opt = True
self._use_reader_alloc = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistSeResneXt2x2Async(TestDistBase): class TestDistSeResneXt2x2Async(TestDistBase):
......
...@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D): ...@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
def compute(self, x, offset, out):
self.attrs = {'pooltype': "MAX", 'is_test': True}
for i in range(len(offset[0]) - 1):
sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
(-1, 3 * 11))
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
def test_check_grad(self):
"""Grad computation does not apply to Sequence MAX
Pool executed when is_test is true """
return
class TestSeqLastPool2D(TestSeqAvgPool2D): class TestSeqLastPool2D(TestSeqAvgPool2D):
def compute(self, x, offset, out): def compute(self, x, offset, out):
self.attrs = {'pooltype': "LAST"} self.attrs = {'pooltype': "LAST"}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册