未验证 提交 04b1df2a 编写于 作者: T tangwei12 提交者: GitHub

Merge branch 'develop' into dis_ckpt_fix

paddle.fluid.Variable.__init__ ArgSpec(args=['self', 'block', 'type', 'name', 'shape', 'dtype', 'lod_level', 'capacity', 'persistable', 'error_clip', 'stop_gradient', 'is_data'], varargs=None, keywords='kwargs', defaults=(VarType.LOD_TENSOR, None, None, None, None, None, None, None, False, False))
paddle.fluid.Variable.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
...@@ -33,8 +28,6 @@ paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=Non ...@@ -33,8 +28,6 @@ paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=Non
paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None) paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None)
paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
...@@ -42,8 +35,7 @@ paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', def ...@@ -42,8 +35,7 @@ paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', def
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.begin_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.end_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
...@@ -207,31 +199,23 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None ...@@ -207,31 +199,23 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.lod_rank_table ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.max_sequence_len ArgSpec(args=['rank_table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.lod_tensor_to_array ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_to_lod_tensor ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)) paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True))
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)) paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shrink_memory ArgSpec(args=['x', 'i', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
paddle.fluid.layers.IfElse.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
...@@ -240,9 +224,6 @@ paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', ...@@ -240,9 +224,6 @@ paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs',
paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ConditionalBlock.__init__ ArgSpec(args=['self', 'inputs', 'is_scalar_condition', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.ConditionalBlock.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ConditionalBlock.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
......
...@@ -45,19 +45,13 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { ...@@ -45,19 +45,13 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
void Executor::BeginPass() {
::paddle::operators::distributed::RPCClient::GetInstance< ::paddle::operators::distributed::RPCClient::GetInstance<
::paddle::operators::distributed::GRPCClient>() ::paddle::operators::distributed::GRPCClient>()
->SendBeginPass(); ->SendComplete();
}
void Executor::EndPass() {
::paddle::operators::distributed::RPCClient::GetInstance<
::paddle::operators::distributed::GRPCClient>()
->SendEndPass();
}
#endif #endif
}
void InitializeVariable(Variable* var, proto::VarType::Type var_type) { void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) { if (var_type == proto::VarType::LOD_TENSOR) {
......
...@@ -44,17 +44,11 @@ class Executor { ...@@ -44,17 +44,11 @@ class Executor {
explicit Executor(const platform::Place& place); explicit Executor(const platform::Place& place);
#ifdef PADDLE_WITH_DISTRIBUTE
/* /*
* Sending signal to pserver to mark current pass started. * Close this Executor.
* Calling this method will send complete messages to all pserver instances.
*/ */
void BeginPass(); void Close();
/*
* Sending signal to pserver to mark current pass finished.
*/
void EndPass();
#endif
/* @Brief /* @Brief
* Runtime evaluation of the given ProgramDesc under certain Scope * Runtime evaluation of the given ProgramDesc under certain Scope
......
...@@ -137,6 +137,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -137,6 +137,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
executor_->RunPreparedContext( executor_->RunPreparedContext(
ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(), ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
&feed_targets, &fetch_targets, &feed_targets, &fetch_targets,
false, /* don't create local scope each time*/
false /* don't create variable eatch time */); false /* don't create variable eatch time */);
VLOG(4) << "Finish prepared context"; VLOG(4) << "Finish prepared context";
if (!GetFetch(fetchs, output_data)) { if (!GetFetch(fetchs, output_data)) {
......
...@@ -32,11 +32,11 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, ...@@ -32,11 +32,11 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
for (int h = 0; h < shape.h(); ++h) { for (int h = 0; h < shape.h(); ++h) {
for (int w = 0; w < shape.w(); ++w) { for (int w = 0; w < shape.w(); ++w) {
odata[h * ostrides.h() + w * ostrides.w()] = odata[h * ostrides.h() + w * ostrides.w()] =
idata[h * ostrides.h() + w * ostrides.w()]; idata[h * istrides.h() + w * istrides.w()];
} }
} }
} }
// indata c * k
// Reorder the data layout from CK to KC. // Reorder the data layout from CK to KC.
void ReorderCKtoKC(TensorRTEngine::Weight& iweights, void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
TensorRTEngine::Weight* oweights) { TensorRTEngine::Weight* oweights) {
...@@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter { ...@@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter {
framework::LoDTensor tmp; framework::LoDTensor tmp;
tmp.Resize(Y_t->dims()); tmp.Resize(Y_t->dims());
memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(), memcpy(tmp.mutable_data<float>(platform::CPUPlace()), weight_data,
Y_t->dims()[0] * Y_t->dims()[1]); Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data), static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)}; Y_t->memory_size() / sizeof(float)};
...@@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter { ...@@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter {
// The data layout of TRT FC layer's weight is different from fluid's FC, // The data layout of TRT FC layer's weight is different from fluid's FC,
// need to reorder the elements. // need to reorder the elements.
ReorderCKtoKC(tmp_weight, &weight); ReorderCKtoKC(weight, &tmp_weight);
// Currently, the framework can only handle one fluid op -> one TRT layer, // Currently, the framework can only handle one fluid op -> one TRT layer,
// but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
...@@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter { ...@@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter {
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
*const_cast<nvinfer1::ITensor*>(X), *const_cast<nvinfer1::ITensor*>(X),
n_output, weight.get(), bias.get()); n_output, tmp_weight.get(), bias.get());
auto output_name = op_desc.Output("Out").front(); auto output_name = op_desc.Output("Out").front();
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
...@@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter { ...@@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter {
} // namespace paddle } // namespace paddle
REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
USE_OP(mul);
...@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) { ...@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
LOG(INFO) << "execute"; LOG(INFO) << "execute";
validator.Execute(10); validator.Execute(1);
} }
} // namespace tensorrt } // namespace tensorrt
......
...@@ -23,11 +23,11 @@ namespace tensorrt { ...@@ -23,11 +23,11 @@ namespace tensorrt {
TEST(fc_op, test) { TEST(fc_op, test) {
std::unordered_set<std::string> parameters({"mul-Y"}); std::unordered_set<std::string> parameters({"mul-Y"});
framework::Scope scope; framework::Scope scope;
TRTConvertValidation validator(20, parameters, scope, 1000); TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1));
validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1)); validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2)); // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
// Prepare Op description // Prepare Op description
framework::OpDesc desc; framework::OpDesc desc;
...@@ -38,9 +38,10 @@ TEST(fc_op, test) { ...@@ -38,9 +38,10 @@ TEST(fc_op, test) {
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
validator.Execute(10); validator.Execute(1);
} }
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(mul);
...@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) { ...@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) {
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
LOG(INFO) << "execute"; LOG(INFO) << "execute";
validator.Execute(10); validator.Execute(1);
} }
} // namespace tensorrt } // namespace tensorrt
......
...@@ -39,7 +39,7 @@ namespace tensorrt { ...@@ -39,7 +39,7 @@ namespace tensorrt {
float random(float low, float high) { float random(float low, float high) {
static std::random_device rd; static std::random_device rd;
static std::mt19937 mt(rd()); static std::mt19937 mt(rd());
std::uniform_real_distribution<double> dist(1.0, 10.0); std::uniform_real_distribution<double> dist(low, high);
return dist(mt); return dist(mt);
} }
...@@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, ...@@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
size_t num_elements = analysis::AccuDims(dims, dims.size()); size_t num_elements = analysis::AccuDims(dims, dims.size());
PADDLE_ENFORCE_GT(num_elements, 0); PADDLE_ENFORCE_GT(num_elements, 0);
auto* data = tensor->mutable_data<float>(place); auto* data = tensor->mutable_data<float>(place);
for (size_t i = 0; i < num_elements; i++) { for (size_t i = 0; i < num_elements; i++) {
*(data + i) = random(0., 1.); *(data + i) = random(0., 1.);
} }
...@@ -68,7 +69,7 @@ class TRTConvertValidation { ...@@ -68,7 +69,7 @@ class TRTConvertValidation {
int workspace_size = 1 << 10) int workspace_size = 1 << 10)
: parameters_(parameters), scope_(scope) { : parameters_(parameters), scope_(scope) {
// create engine. // create engine.
engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_));
engine_->InitNetwork(); engine_->InitNetwork();
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
...@@ -138,12 +139,11 @@ class TRTConvertValidation { ...@@ -138,12 +139,11 @@ class TRTConvertValidation {
cudaStreamSynchronize(*engine_->stream()); cudaStreamSynchronize(*engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 200; const size_t output_space_size = 2000;
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
std::vector<float> fluid_out; std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size); std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0], engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
output_space_size * sizeof(float));
cudaStreamSynchronize(*engine_->stream()); cudaStreamSynchronize(*engine_->stream());
auto* var = scope_.FindVar(output); auto* var = scope_.FindVar(output);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License"); you may not use
you may not use this file except in compliance with the License. this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
...@@ -26,6 +26,8 @@ namespace paddle { ...@@ -26,6 +26,8 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
int TensorRTEngine::runtime_batch_ = 1;
void TensorRTEngine::Build(const DescType &paddle_model) { void TensorRTEngine::Build(const DescType &paddle_model) {
PADDLE_ENFORCE(false, "not implemented"); PADDLE_ENFORCE(false, "not implemented");
} }
...@@ -42,6 +44,7 @@ void TensorRTEngine::Execute(int batch_size) { ...@@ -42,6 +44,7 @@ void TensorRTEngine::Execute(int batch_size) {
PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_NOT_NULL(stream_);
infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
cudaStreamSynchronize(*stream_); cudaStreamSynchronize(*stream_);
SetRuntimeBatch(batch_size);
} }
TensorRTEngine::~TensorRTEngine() { TensorRTEngine::~TensorRTEngine() {
...@@ -80,17 +83,17 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -80,17 +83,17 @@ void TensorRTEngine::FreezeNetwork() {
auto dims = infer_engine_->getBindingDimensions(slot_offset); auto dims = infer_engine_->getBindingDimensions(slot_offset);
item.second = kDataTypeSize[static_cast<int>( item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] * infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims); analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE_GT(item.second, 0); PADDLE_ENFORCE_GT(item.second, 0);
} }
auto &buf = buffer(item.first); auto &buf = buffer(item.first);
buf.max_size = item.second * max_batch_; buf.max_size = item.second * max_batch_;
CHECK(buf.buffer == nullptr); // buffer should be allocated only once. CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
// buf.size will changed in the runtime.
buf.size = 0; buf.size = 0;
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G
buf.device = DeviceType::GPU; buf.device = DeviceType::GPU;
} }
} }
...@@ -105,7 +108,7 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, ...@@ -105,7 +108,7 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
auto *input = infer_network_->addInput(name.c_str(), dtype, dims); auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name); PADDLE_ENFORCE(input, "infer network add input %s failed", name);
buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
analysis::AccuDims(dims.d, dims.nbDims); analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE(input->isNetworkInput()); PADDLE_ENFORCE(input->isNetworkInput());
TensorRTEngine::SetITensor(name, input); TensorRTEngine::SetITensor(name, input);
return input; return input;
...@@ -149,35 +152,42 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) { ...@@ -149,35 +152,42 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
size_t max_size) { size_t max_size) {
// determine data size // determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name); auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0); PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second); PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name); auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second, PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToDevice, *stream_), cudaMemcpyDeviceToDevice, *stream_),
0); 0);
} }
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
size_t max_size) { size_t max_size) {
VLOG(4) << "get output in cpu";
auto &buf = buffer(name);
// Update needed buffer size.
auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
auto dims = infer_engine_->getBindingDimensions(slot_offset);
buf.size = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims);
PADDLE_ENFORCE_LE(buf.size, buf.max_size);
// determine data size // determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
// DEBUG PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
memset(dst, 0, buf.size); cudaMemcpyDeviceToHost, *stream_));
PADDLE_ENFORCE_EQ(
0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
} }
Buffer &TensorRTEngine::buffer(const std::string &name) { Buffer &TensorRTEngine::buffer(const std::string &name) {
...@@ -225,6 +235,12 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { ...@@ -225,6 +235,12 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
return itensor_map_[name]; return itensor_map_[name];
} }
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
runtime_batch_ = batch_size;
}
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -117,10 +117,14 @@ class TensorRTEngine : public EngineBase { ...@@ -117,10 +117,14 @@ class TensorRTEngine : public EngineBase {
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
private: private:
// the max batch size // the max batch size
int max_batch_; int max_batch_;
// the runtime batch size
static int runtime_batch_;
// the max memory size the engine uses // the max memory size the engine uses
int max_workspace_; int max_workspace_;
......
...@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test { ...@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ASSERT_EQ(0, cudaStreamCreate(&stream_)); ASSERT_EQ(0, cudaStreamCreate(&stream_));
engine_ = new TensorRTEngine(1, 1 << 10, &stream_); engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
engine_->InitNetwork(); engine_->InitNetwork();
} }
...@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float y_cpu; float y_cpu;
engine_->GetOutputInCPU("y", &y_cpu, sizeof(float)); engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
LOG(INFO) << "to checkout output"; LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu, x_v * 2 + 3); ASSERT_EQ(y_cpu, x_v * 2 + 3);
...@@ -103,15 +103,49 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ...@@ -103,15 +103,49 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float y_cpu[2] = {-1., -1.}; float y_cpu[2] = {-1., -1.};
auto dims = engine_->GetITensor("y")->getDimensions(); auto dims = engine_->GetITensor("y")->getDimensions();
ASSERT_EQ(dims.nbDims, 3); ASSERT_EQ(dims.nbDims, 3);
ASSERT_EQ(dims.d[0], 2); ASSERT_EQ(dims.d[0], 2);
ASSERT_EQ(dims.d[1], 1); ASSERT_EQ(dims.d[1], 1);
engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2); engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[0], 4.5);
ASSERT_EQ(y_cpu[1], 14.5); ASSERT_EQ(y_cpu[1], 14.5);
} }
TEST_F(TensorRTEngineTest, test_conv2d_temp) {
// Weight in CPU memory.
float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
float raw_bias[1] = {0};
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 3, 3});
auto* conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr);
conv_layer->setStride(nvinfer1::DimsHW{1, 1});
conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
engine_->DeclareOutput(conv_layer, 0, "y");
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
18 * sizeof(float));
engine_->Execute(2);
LOG(INFO) << "to get output";
float* y_cpu = new float[18];
engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
ASSERT_EQ(y_cpu[0], 4.0);
ASSERT_EQ(y_cpu[1], 6.0);
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -210,13 +210,14 @@ void TestInference(const std::string& dirname, ...@@ -210,13 +210,14 @@ void TestInference(const std::string& dirname,
// Ignore the profiling results of the first run // Ignore the profiling results of the first run
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx; std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
bool CreateLocalScope = CreateVars;
if (PrepareContext) { if (PrepareContext) {
ctx = executor.Prepare(*inference_program, 0); ctx = executor.Prepare(*inference_program, 0);
executor.RunPreparedContext(ctx.get(), scope, &feed_targets, executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
&fetch_targets, true, CreateVars); &fetch_targets, CreateLocalScope, CreateVars);
} else { } else {
executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
true, CreateVars); CreateLocalScope, CreateVars);
} }
// Enable the profiler // Enable the profiler
...@@ -232,10 +233,11 @@ void TestInference(const std::string& dirname, ...@@ -232,10 +233,11 @@ void TestInference(const std::string& dirname,
// Note: if you change the inference_program, you need to call // Note: if you change the inference_program, you need to call
// executor.Prepare() again to get a new ExecutorPrepareContext. // executor.Prepare() again to get a new ExecutorPrepareContext.
executor.RunPreparedContext(ctx.get(), scope, &feed_targets, executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
&fetch_targets, CreateVars); &fetch_targets, CreateLocalScope,
CreateVars);
} else { } else {
executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
CreateVars); CreateLocalScope, CreateVars);
} }
} }
......
...@@ -18,7 +18,7 @@ if(WITH_GRPC) ...@@ -18,7 +18,7 @@ if(WITH_GRPC)
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(grpc_serde_test SRCS grpc_serde_test.cc cc_test(grpc_serde_test SRCS grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(grpc_server_test SRCS rpc_server_test.cc cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL)
return() return()
endif() endif()
......
...@@ -36,20 +36,16 @@ void GRPCClient::InitEventLoop() { ...@@ -36,20 +36,16 @@ void GRPCClient::InitEventLoop() {
client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
} }
void GRPCClient::SendBeginPass() { void GRPCClient::SendComplete() {
std::unique_lock<std::mutex> lk(completed_mutex_);
if (!completed_) {
for (auto& it : channels_) { for (auto& it : channels_) {
VLOG(3) << "send begin pass to: " << it.first; VLOG(3) << "send complete message to " << it.first;
this->AsyncSendBeginPass(it.first); this->AsyncSendComplete(it.first);
} }
this->Wait(); PADDLE_ENFORCE(this->Wait(), "internal grpc error");
} completed_ = true;
void GRPCClient::SendEndPass() {
for (auto& it : channels_) {
VLOG(3) << "send end pass to " << it.first;
this->AsyncSendEndPass(it.first);
} }
this->Wait();
} }
GRPCClient::~GRPCClient() { GRPCClient::~GRPCClient() {
...@@ -239,32 +235,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -239,32 +235,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
req_count_++; req_count_++;
} }
void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) { void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
s->Prepare(time_out); s->Prepare(time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(BEGIN_PASS_MESSAGE); req.set_varname(COMPLETE_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
} }
void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) {
const auto ch = GetChannel(ep);
FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
s->Prepare(time_out);
sendrecv::VariableMessage req;
req.set_varname(END_PASS_MESSAGE);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
}
void GRPCClient::AsyncCheckpointNotify(const std::string& ep, void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const std::string& dir, const std::string& dir,
int64_t time_out) { int64_t time_out) {
......
...@@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor { ...@@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
class GRPCClient : public RPCClient { class GRPCClient : public RPCClient {
public: public:
GRPCClient() : ok_(true) {} GRPCClient() : ok_(true), completed_(false) {}
virtual ~GRPCClient(); virtual ~GRPCClient();
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
...@@ -201,17 +201,12 @@ class GRPCClient : public RPCClient { ...@@ -201,17 +201,12 @@ class GRPCClient : public RPCClient {
void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendBeginPass(const std::string& ep, void AsyncSendComplete(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendEndPass(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
bool Wait() override; bool Wait() override;
void SendBeginPass() override; void SendComplete() override;
void SendEndPass() override;
protected: protected:
void InitImpl() override; void InitImpl() override;
...@@ -238,6 +233,10 @@ class GRPCClient : public RPCClient { ...@@ -238,6 +233,10 @@ class GRPCClient : public RPCClient {
// mutex for GetChannel thread safety // mutex for GetChannel thread safety
std::mutex chan_mutex_; std::mutex chan_mutex_;
DISABLE_COPY_AND_ASSIGN(GRPCClient); DISABLE_COPY_AND_ASSIGN(GRPCClient);
// mutex for sending complete message only once
std::mutex completed_mutex_;
bool completed_;
}; };
} // namespace distributed } // namespace distributed
......
...@@ -43,8 +43,6 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; ...@@ -43,8 +43,6 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
#define COMPLETE_MESSAGE "COMPLETE@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV"
#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV"
#define END_PASS_MESSAGE "END_PASS@RECV"
#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
......
...@@ -55,10 +55,9 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -55,10 +55,9 @@ bool RequestSendHandler::Handle(const std::string& varname,
if (varname == BATCH_BARRIER_MESSAGE) { if (varname == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
rpc_server_->IncreaseBatchBarrier(kRequestSend); rpc_server_->IncreaseBatchBarrier(kRequestSend);
} else if (varname == BEGIN_PASS_MESSAGE) { } else if (varname == COMPLETE_MESSAGE) {
VLOG(3) << "sync: recv begin pass message"; VLOG(3) << "sync: recv complete message";
rpc_server_->WaitCond(kRequestSend); rpc_server_->Complete();
rpc_server_->BeginPass();
} else { } else {
VLOG(3) << "sync: received var_name: " << varname; VLOG(3) << "sync: received var_name: " << varname;
rpc_server_->WaitCond(kRequestSend); rpc_server_->WaitCond(kRequestSend);
...@@ -94,14 +93,12 @@ bool RequestGetHandler::Handle(const std::string& varname, ...@@ -94,14 +93,12 @@ bool RequestGetHandler::Handle(const std::string& varname,
if (varname == FETCH_BARRIER_MESSAGE) { if (varname == FETCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv fetch barrier message"; VLOG(3) << "sync: recv fetch barrier message";
rpc_server_->IncreaseBatchBarrier(kRequestGet); rpc_server_->IncreaseBatchBarrier(kRequestGet);
} else if (varname == END_PASS_MESSAGE) {
rpc_server_->EndPass();
} else { } else {
rpc_server_->WaitCond(kRequestGet); rpc_server_->WaitCond(kRequestGet);
*outvar = scope_->FindVar(varname); *outvar = scope_->FindVar(varname);
} }
} else { } else {
if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) { if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) {
*outvar = scope_->FindVar(varname); *outvar = scope_->FindVar(varname);
} }
} }
......
...@@ -60,17 +60,13 @@ class RPCClient { ...@@ -60,17 +60,13 @@ class RPCClient {
const std::string& dir, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual void AsyncSendBeginPass(const std::string& ep, virtual void AsyncSendComplete(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual void AsyncSendEndPass(const std::string& ep, // Complete tells all the pserver instances that finishe the training,
int64_t time_out = FLAGS_rpc_deadline) = 0; // the pserver can reduce it's barrier count, and continue to train
// BeginePass/EndPass tells all the pserver that start/end a pass, so that
// the pserver can increase/reduce it's barrier count, and continue to train
// with other trainers. // with other trainers.
virtual void SendBeginPass() = 0; virtual void SendComplete() = 0;
virtual void SendEndPass() = 0;
virtual bool Wait() = 0; virtual bool Wait() = 0;
......
...@@ -64,18 +64,7 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { ...@@ -64,18 +64,7 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
} }
} }
void RPCServer::BeginPass() { void RPCServer::Complete() {
VLOG(4) << "RPCServer begin increase pass barrier";
{
std::unique_lock<std::mutex> lock(mutex_);
client_num_++;
VLOG(4) << "increase client_num to: " << client_num_;
}
barrier_cond_.notify_all();
}
void RPCServer::EndPass() {
VLOG(4) << "RPCServer begin increase pass barrier";
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
client_num_--; client_num_--;
...@@ -87,6 +76,11 @@ void RPCServer::EndPass() { ...@@ -87,6 +76,11 @@ void RPCServer::EndPass() {
barrier_cond_.notify_all(); barrier_cond_.notify_all();
} }
int RPCServer::GetClientNum() {
std::unique_lock<std::mutex> lock(mutex_);
return client_num_;
}
void RPCServer::ResetBarrierCounter() { void RPCServer::ResetBarrierCounter() {
VLOG(3) << "RPCServer ResetBarrierCounter "; VLOG(3) << "RPCServer ResetBarrierCounter ";
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
......
...@@ -44,7 +44,7 @@ class RPCServer { ...@@ -44,7 +44,7 @@ class RPCServer {
int GetSelectedPort() const { return selected_port_; } int GetSelectedPort() const { return selected_port_; }
int GetClientNum() const; int GetClientNum();
void SavePort() const; void SavePort() const;
...@@ -64,8 +64,7 @@ class RPCServer { ...@@ -64,8 +64,7 @@ class RPCServer {
void WaitCond(const std::string& rpc_name); void WaitCond(const std::string& rpc_name);
void IncreaseBatchBarrier(const std::string rpc_name); void IncreaseBatchBarrier(const std::string rpc_name);
void BeginPass(); void Complete();
void EndPass();
void ResetBarrierCounter(); void ResetBarrierCounter();
......
...@@ -91,7 +91,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, ...@@ -91,7 +91,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
} }
} }
void StartServer() { void StartServer(const std::string& rpc_name) {
framework::ProgramDesc program; framework::ProgramDesc program;
framework::Scope scope; framework::Scope scope;
platform::CPUPlace place; platform::CPUPlace place;
...@@ -107,14 +107,14 @@ void StartServer() { ...@@ -107,14 +107,14 @@ void StartServer() {
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
prefetch_var_name_to_prepared; prefetch_var_name_to_prepared;
prefetch_var_name_to_prepared[in_var_name] = prepared[0]; prefetch_var_name_to_prepared[in_var_name] = prepared[0];
g_req_handler->SetProgram(&program); g_req_handler->SetProgram(&program);
g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
g_req_handler->SetDevCtx(&ctx); g_req_handler->SetDevCtx(&ctx);
g_req_handler->SetScope(&scope); g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe); g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(distributed::kRequestPrefetch, g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get()); g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread( std::thread server_thread(
...@@ -129,7 +129,7 @@ TEST(PREFETCH, CPU) { ...@@ -129,7 +129,7 @@ TEST(PREFETCH, CPU) {
distributed::RPCClient* client = distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::thread server_thread(StartServer); std::thread server_thread(StartServer, distributed::kRequestPrefetch);
g_rpc_service->WaitServerReady(); g_rpc_service->WaitServerReady();
int port = g_rpc_service->GetSelectedPort(); int port = g_rpc_service->GetSelectedPort();
...@@ -162,3 +162,24 @@ TEST(PREFETCH, CPU) { ...@@ -162,3 +162,24 @@ TEST(PREFETCH, CPU) {
g_rpc_service.reset(nullptr); g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr); g_req_handler.reset(nullptr);
} }
TEST(COMPLETE, CPU) {
g_req_handler.reset(new distributed::RequestSendHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
PADDLE_ENFORCE(client != nullptr);
std::thread server_thread(StartServer, distributed::kRequestSend);
g_rpc_service->WaitServerReady();
int port = g_rpc_service->GetSelectedPort();
std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
client->AsyncSendComplete(ep);
client->Wait();
EXPECT_EQ(g_rpc_service->GetClientNum(), 1);
g_rpc_service->ShutDown();
server_thread.join();
g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr);
}
...@@ -38,12 +38,10 @@ class LoDTensorBlockingQueue { ...@@ -38,12 +38,10 @@ class LoDTensorBlockingQueue {
public: public:
bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) { bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
CheckDims(lod_tensor_vec);
return queue_.Send(lod_tensor_vec); return queue_.Send(lod_tensor_vec);
} }
bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) { bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) {
CheckDims(lod_tensor_vec);
return queue_.Send(std::move(lod_tensor_vec)); return queue_.Send(std::move(lod_tensor_vec));
} }
...@@ -65,21 +63,6 @@ class LoDTensorBlockingQueue { ...@@ -65,21 +63,6 @@ class LoDTensorBlockingQueue {
inline bool IsClosed() const { return queue_.IsClosed(); } inline bool IsClosed() const { return queue_.IsClosed(); }
private: private:
void CheckDims(
const std::vector<framework::LoDTensor>& lod_tensor_vec) const {
PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(),
"Expect input size is %d but found %s", dims_.size(),
lod_tensor_vec.size());
for (size_t i = 0; i < dims_.size(); ++i) {
const auto& in_dims = framework::slice_ddim(
lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size());
const auto& expect_dims =
framework::slice_ddim(dims_[i], 1, dims_[i].size());
PADDLE_ENFORCE(in_dims == expect_dims,
"Dims of the %d-th input tensor do not match", i);
}
}
BlockingQueue<std::vector<framework::LoDTensor>> queue_; BlockingQueue<std::vector<framework::LoDTensor>> queue_;
std::vector<framework::DDim> dims_; std::vector<framework::DDim> dims_;
}; };
......
...@@ -216,7 +216,7 @@ class ReshapeKernel { ...@@ -216,7 +216,7 @@ class ReshapeKernel {
if (shape_tensor) { if (shape_tensor) {
auto *shape_data = shape_tensor->data<int>(); auto *shape_data = shape_tensor->data<int>();
framework::Tensor cpu_shape_tensor; framework::Tensor cpu_shape_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(shape_tensor->place())) {
TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>(); shape_data = cpu_shape_tensor.data<int>();
} }
......
...@@ -55,13 +55,14 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) { ...@@ -55,13 +55,14 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
"TensorRT' tensor input requires at least 2 dimensions"); "TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL, PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions"); "TensorRT' tensor input requires at most 4 dimensions");
switch (shape.size()) { switch (shape.size()) {
case 2: case 2:
return nvinfer1::Dims2(shape[0], shape[1]); return nvinfer1::Dims2(1, shape[1]);
case 3: case 3:
return nvinfer1::Dims3(shape[0], shape[1], shape[2]); return nvinfer1::Dims3(1, shape[1], shape[2]);
case 4: case 4:
return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]);
default: default:
return nvinfer1::Dims(); return nvinfer1::Dims();
} }
......
...@@ -93,13 +93,15 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -93,13 +93,15 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto* fluid_v = context.scope().FindVar(y); auto* fluid_v = context.scope().FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>(); auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
fluid_t->Resize(framework::make_ddim(ddim)); fluid_t->Resize(framework::make_ddim(ddim));
// TODO(Superjomn) find some way to determine which device to output the // TODO(Superjomn) find some way to determine which device to output the
// tensor. // tensor.
// if (platform::is_cpu_place(fluid_t->place())) { // if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size. // TODO(Superjomn) change this float to dtype size.
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
FLAGS_tensorrt_engine_batch_size;
engine->GetOutputInCPU(y, engine->GetOutputInCPU(y,
fluid_t->mutable_data<float>(platform::CPUPlace()), fluid_t->mutable_data<float>(platform::CPUPlace()),
size * sizeof(float)); size * sizeof(float));
......
...@@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) { ...@@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) {
LOG(INFO) << "create block desc"; LOG(INFO) << "create block desc";
framework::BlockDesc block_desc(&program, block_); framework::BlockDesc block_desc(&program, block_);
LOG(INFO) << "create mul op"; LOG(INFO) << "create fc op";
auto* mul = block_desc.AppendOp(); auto* fc0 = block_desc.AppendOp();
mul->SetType("mul"); fc0->SetType("fc");
mul->SetInput("X", std::vector<std::string>({"x"})); // 2 x 4 fc0->SetInput("X", std::vector<std::string>({"x"})); // 4 x 1 x 1
mul->SetInput("Y", std::vector<std::string>({"y"})); // 4 x 6 fc0->SetInput("Y", std::vector<std::string>({"y"})); // 4 x 6
mul->SetOutput("Out", std::vector<std::string>({"z"})); // 2 x 6 fc0->SetOutput("Out", std::vector<std::string>({"z"})); // 6 x 1 x 1
LOG(INFO) << "create fc op"; LOG(INFO) << "create fc op";
auto* fc = block_desc.AppendOp(); auto* fc1 = block_desc.AppendOp();
fc->SetType("mul"); fc1->SetType("fc");
fc->SetInput("X", std::vector<std::string>({"z"})); fc1->SetInput("X", std::vector<std::string>({"z"}));
fc->SetInput("Y", std::vector<std::string>({"y0"})); // 6 x 8 fc1->SetInput("Y", std::vector<std::string>({"y0"})); // 6 x 8
fc->SetOutput("Out", std::vector<std::string>({"z0"})); // 2 x 8 fc1->SetOutput("Out", std::vector<std::string>({"z0"})); // 8 x 1 x 1
// Set inputs' variable shape in BlockDesc // Set inputs' variable shape in BlockDesc
AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4})); // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 1, 1}));
AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6})); AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8})); AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6})); AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
// It is wired, need to copy manually. // It is wired, need to copy manually.
*block_->add_ops() = *mul->Proto(); *block_->add_ops() = *fc0->Proto();
*block_->add_ops() = *fc->Proto(); *block_->add_ops() = *fc1->Proto();
ASSERT_EQ(block_->ops_size(), 2); ASSERT_EQ(block_->ops_size(), 2);
LOG(INFO) << "create tensorrt desc"; LOG(INFO) << "create tensorrt desc";
framework::OpDesc engine_op_desc(nullptr); framework::OpDesc engine_op_desc(nullptr);
engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetType("tensorrt_engine");
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"})); engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); block_->SerializeAsString());
...@@ -207,5 +208,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); } ...@@ -207,5 +208,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
USE_TRT_CONVERTER(mul)
USE_TRT_CONVERTER(fc) USE_TRT_CONVERTER(fc)
...@@ -498,10 +498,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -498,10 +498,7 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<framework::Executor>(m, "Executor") py::class_<framework::Executor>(m, "Executor")
.def(py::init<const platform::Place &>()) .def(py::init<const platform::Place &>())
#ifdef PADDLE_WITH_DISTRIBUTE .def("close", &Executor::Close)
.def("begin_pass", &Executor::BeginPass)
.def("end_pass", &Executor::EndPass)
#endif
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars) { int block_id, bool create_local_scope, bool create_vars) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
......
...@@ -247,6 +247,7 @@ class Executor(object): ...@@ -247,6 +247,7 @@ class Executor(object):
p.set_place(place) p.set_place(place)
self.executor = core.Executor(p) self.executor = core.Executor(p)
self.program_caches = dict() self.program_caches = dict()
self._closed = False
def as_lodtensor(self, data): def as_lodtensor(self, data):
""" """
...@@ -348,11 +349,23 @@ class Executor(object): ...@@ -348,11 +349,23 @@ class Executor(object):
] ]
return outs return outs
def begin_pass(self): def close(self):
self.executor.begin_pass() """
Close this executor.
You can no long use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to
the current Trainer.
def end_pass(self): Example:
self.executor.end_pass() >>> cpu = core.CPUPlace()
>>> exe = Executor(cpu)
>>> ...
>>> exe.close()
"""
if not self._closed:
self.executor.close()
self._closed = True
def run(self, def run(self,
program=None, program=None,
...@@ -405,6 +418,10 @@ class Executor(object): ...@@ -405,6 +418,10 @@ class Executor(object):
>>> feed={'X': x}, >>> feed={'X': x},
>>> fetch_list=[loss.name]) >>> fetch_list=[loss.name])
""" """
if self._closed:
raise RuntimeError("Attempted to use a closed Executor")
if feed is None: if feed is None:
feed = {} feed = {}
if not isinstance(feed, dict): if not isinstance(feed, dict):
......
...@@ -32,7 +32,6 @@ except Exception, e: ...@@ -32,7 +32,6 @@ except Exception, e:
import unique_name import unique_name
__all__ = [ __all__ = [
'Variable',
'Program', 'Program',
'Operator', 'Operator',
'Parameter', 'Parameter',
...@@ -302,7 +301,7 @@ class Variable(object): ...@@ -302,7 +301,7 @@ class Variable(object):
__repr__ = __str__ __repr__ = __str__
def set_desc(self, input): def _set_desc(self, input):
""" """
Set the variable description. Set the variable description.
...@@ -347,7 +346,7 @@ class Variable(object): ...@@ -347,7 +346,7 @@ class Variable(object):
def type(self): def type(self):
return self.desc.type() return self.desc.type()
def set_error_clip(self, error_clip): def _set_error_clip(self, error_clip):
""" """
Set the error_clip. Set the error_clip.
......
...@@ -797,104 +797,6 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -797,104 +797,6 @@ def get_parameter_value_by_name(name, executor, program=None):
return get_parameter_value(var, executor) return get_parameter_value(var, executor)
def get_test_program(filelist, program=None, startup_program=None):
"""
Transpile current train program to a program to read test dataset
if the program is using reader ops like "open_files_op".
"""
def _copy_reader_var_(block, var, new_name=None):
if new_name == None:
new_name = var.name
new_var = block.create_var(
name=str(new_name), type=core.VarDesc.VarType.READER)
new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes())
new_var.persistable = True
return new_var
def _get_test_reader_name(train_reader_name):
return train_reader_name + "_test"
def _is_reader_op(op):
block = op.block
if "Out" in op.output_names:
reader_out = block.vars[op.output("Out")[0]]
if reader_out.type == core.VarDesc.VarType.READER:
return True
return False
if program == None:
program = default_main_program()
if startup_program == None:
startup_program = default_startup_program()
startup_block = startup_program.global_block()
# 1. find out the orignal reader var name
startup_reader_op_list = []
for op in startup_block.ops:
if _is_reader_op(op):
startup_reader_op_list.append(op)
if len(startup_reader_op_list) == 0:
return program
root_reader_op = startup_reader_op_list[0]
train_test_reader_map = {}
# 2. add operators to startup to read open and read test data files
for op in startup_reader_op_list:
assert (len(op.output("Out")) == 1)
train_reader_name = op.output("Out")[0]
train_reader = startup_block.vars[train_reader_name]
test_reader = _copy_reader_var_(
startup_block,
train_reader,
new_name=_get_test_reader_name(train_reader_name))
train_test_reader_map[train_reader.name] = test_reader
test_op_inputs = {}
for name in op.input_names:
train_arg_names = op.input(name)
test_arg_vars = []
for arg_name in train_arg_names:
arg_var = train_test_reader_map[
arg_name] if name == "UnderlyingReader" else startup_block.vars[
arg_name]
test_arg_vars.append(arg_var)
test_op_inputs[name] = test_arg_vars
test_op = startup_block.append_op(
type=op.type,
inputs=test_op_inputs,
outputs={'Out': [test_reader]},
attrs=op.attrs)
# root reader op's filelist attr for read test files
if op.type == root_reader_op.type:
test_op.set_attr("file_names", filelist)
if op.type == "create_multi_pass_reader":
test_op.set_attr("pass_num", 1)
# 3. rename reader vars in inference program to different name
# to avoid read from train data.
main_block = program.global_block()
for var in main_block.vars.values():
if var.type == core.VarDesc.VarType.READER:
main_block._rename_var(
str(var.name), str(_get_test_reader_name(var.name)))
for op in main_block.ops:
if op.type == root_reader_op.type:
test_op.set_attr("file_names", filelist)
if op.type == "create_multi_pass_reader":
test_op.set_attr("pass_num", 1)
startup_program._sync_with_cpp()
program._sync_with_cpp()
return program
def _load_slice_up_vars(executor, dirname, slice_vars_and_atts): def _load_slice_up_vars(executor, dirname, slice_vars_and_atts):
if slice_vars_and_atts == None or len(slice_vars_and_atts) == 0: if slice_vars_and_atts == None or len(slice_vars_and_atts) == 0:
return return
......
...@@ -23,25 +23,17 @@ from ops import logical_and, logical_not, logical_or ...@@ -23,25 +23,17 @@ from ops import logical_and, logical_not, logical_or
import numpy import numpy
__all__ = [ __all__ = [
'split_lod_tensor',
'merge_lod_tensor',
'While', 'While',
'Switch', 'Switch',
'lod_rank_table',
'max_sequence_len',
'lod_tensor_to_array',
'array_to_lod_tensor',
'increment', 'increment',
'array_write', 'array_write',
'create_array', 'create_array',
'less_than', 'less_than',
'equal', 'equal',
'array_read', 'array_read',
'shrink_memory',
'array_length', 'array_length',
'IfElse', 'IfElse',
'DynamicRNN', 'DynamicRNN',
'ConditionalBlock',
'StaticRNN', 'StaticRNN',
'reorder_lod_tensor_by_rank', 'reorder_lod_tensor_by_rank',
'ParallelDo', 'ParallelDo',
...@@ -1457,7 +1449,7 @@ class IfElse(object): ...@@ -1457,7 +1449,7 @@ class IfElse(object):
if self.status == IfElse.OUT_IF_ELSE_BLOCKS: if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
raise ValueError("input must in true/false blocks") raise ValueError("input must in true/false blocks")
if id(x) not in self.input_table: if id(x) not in self.input_table:
parent_block = self.parent_block() parent_block = self._parent_block()
out_true = parent_block.create_var( out_true = parent_block.create_var(
name=unique_name.generate('ifelse_input' + self.helper.name), name=unique_name.generate('ifelse_input' + self.helper.name),
dtype=x.dtype) dtype=x.dtype)
...@@ -1483,7 +1475,7 @@ class IfElse(object): ...@@ -1483,7 +1475,7 @@ class IfElse(object):
else: else:
return out_false return out_false
def parent_block(self): def _parent_block(self):
current_block = self.helper.main_program.current_block() current_block = self.helper.main_program.current_block()
return self.helper.main_program.block(current_block.parent_idx) return self.helper.main_program.block(current_block.parent_idx)
...@@ -1499,7 +1491,7 @@ class IfElse(object): ...@@ -1499,7 +1491,7 @@ class IfElse(object):
out_table = self.output_table[1 if self.status == out_table = self.output_table[1 if self.status ==
self.IN_IF_ELSE_TRUE_BLOCKS else 0] self.IN_IF_ELSE_TRUE_BLOCKS else 0]
parent_block = self.parent_block() parent_block = self._parent_block()
for each_out in outs: for each_out in outs:
if not isinstance(each_out, Variable): if not isinstance(each_out, Variable):
raise TypeError("Each output should be a variable") raise TypeError("Each output should be a variable")
......
...@@ -62,7 +62,7 @@ def noam_decay(d_model, warmup_steps): ...@@ -62,7 +62,7 @@ def noam_decay(d_model, warmup_steps):
The decayed learning rate. The decayed learning rate.
""" """
global_step = _decay_step_counter(1) global_step = _decay_step_counter(1)
with init_on_cpu():
a = global_step**-0.5 a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
...@@ -108,8 +108,6 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -108,8 +108,6 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu():
# update learning_rate
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
...@@ -138,7 +136,6 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -138,7 +136,6 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu():
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
...@@ -184,7 +181,6 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -184,7 +181,6 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu():
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
...@@ -224,13 +220,10 @@ def polynomial_decay(learning_rate, ...@@ -224,13 +220,10 @@ def polynomial_decay(learning_rate,
""" """
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu():
if cycle: if cycle:
div_res = ops.ceil(global_step / decay_steps) div_res = ops.ceil(global_step / decay_steps)
zero_var = tensor.fill_constant( zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0)
shape=[1], dtype='float32', value=0.0) one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0)
one_var = tensor.fill_constant(
shape=[1], dtype='float32', value=1.0)
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
with switch.case(global_step == zero_var): with switch.case(global_step == zero_var):
...@@ -277,7 +270,6 @@ def piecewise_decay(boundaries, values): ...@@ -277,7 +270,6 @@ def piecewise_decay(boundaries, values):
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu():
lr = tensor.create_global_var( lr = tensor.create_global_var(
shape=[1], shape=[1],
value=0.0, value=0.0,
...@@ -288,15 +280,16 @@ def piecewise_decay(boundaries, values): ...@@ -288,15 +280,16 @@ def piecewise_decay(boundaries, values):
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
for i in range(len(boundaries)): for i in range(len(boundaries)):
boundary_val = tensor.fill_constant( boundary_val = tensor.fill_constant(
shape=[1], dtype='float32', value=float(boundaries[i])) shape=[1],
dtype='float32',
value=float(boundaries[i]),
force_cpu=True)
value_var = tensor.fill_constant( value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[i])) shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val): with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr) tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant( last_value_var = tensor.fill_constant(
shape=[1], shape=[1], dtype='float32', value=float(values[len(values) - 1]))
dtype='float32',
value=float(values[len(values) - 1]))
with switch.default(): with switch.default():
tensor.assign(last_value_var, lr) tensor.assign(last_value_var, lr)
......
...@@ -35,7 +35,7 @@ if len(sys.argv) == 1: ...@@ -35,7 +35,7 @@ if len(sys.argv) == 1:
word_dict = paddle.dataset.imdb.word_dict() word_dict = paddle.dataset.imdb.word_dict()
else: else:
word_dict = load_vocab(sys.argv[1]) word_dict = load_vocab(sys.argv[1])
word_dict["<unk>"] = len(word_dict) word_dict["<unk>"] = len(word_dict)
print "Dict dim = ", len(word_dict) print "Dict dim = ", len(word_dict)
# input text data # input text data
...@@ -50,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace()) ...@@ -50,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace())
BATCH_SIZE = 128 BATCH_SIZE = 128
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=10000), paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
test_reader = paddle.batch( test_reader = paddle.batch(
......
...@@ -19,7 +19,7 @@ import sys ...@@ -19,7 +19,7 @@ import sys
TRAIN_FILES = ['train.recordio'] TRAIN_FILES = ['train.recordio']
TEST_FILES = ['test.recordio'] TEST_FILES = ['test.recordio']
DICT_DIM = 89528 DICT_DIM = 5147
# embedding dim # embedding dim
emb_dim = 128 emb_dim = 128
...@@ -27,58 +27,46 @@ emb_dim = 128 ...@@ -27,58 +27,46 @@ emb_dim = 128
# hidden dim # hidden dim
hid_dim = 128 hid_dim = 128
# hidden dim2
hid_dim2 = 96
# class num # class num
class_dim = 2 class_dim = 2
# epoch num
epoch_num = 10
def network_cfg(is_train, pass_num=100):
with fluid.unique_name.guard():
train_file_obj = fluid.layers.open_files(
filenames=TRAIN_FILES,
pass_num=pass_num,
shapes=[[-1, 1], [-1, 1]],
lod_levels=[1, 0],
dtypes=['int64', 'int64'])
test_file_obj = fluid.layers.open_files( def build_program(is_train):
filenames=TEST_FILES, file_obj_handle = fluid.layers.io.open_files(
pass_num=1, filenames=TRAIN_FILES if is_train else TEST_FILES,
shapes=[[-1, 1], [-1, 1]], shapes=[[-1, 1], [-1, 1]],
lod_levels=[1, 0], lod_levels=[1, 0],
dtypes=['int64', 'int64']) dtypes=['int64', 'int64'])
if is_train: file_obj = fluid.layers.io.double_buffer(file_obj_handle)
file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000)
else:
file_obj = test_file_obj
file_obj = fluid.layers.double_buffer( with fluid.unique_name.guard():
file_obj,
name="train_double_buffer" if is_train else 'test_double_buffer')
data, label = fluid.layers.read_file(file_obj) data, label = fluid.layers.read_file(file_obj)
emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim]) emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim])
# sequence conv with window size = 3
win_size = 3
conv_3 = fluid.nets.sequence_conv_pool( conv_3 = fluid.nets.sequence_conv_pool(
input=emb, input=emb,
num_filters=hid_dim, num_filters=hid_dim,
filter_size=win_size, filter_size=3,
act="tanh", act="tanh",
pool_type="max") pool_type="sqrt")
# fc layer after conv conv_4 = fluid.nets.sequence_conv_pool(
fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) input=emb,
num_filters=hid_dim,
filter_size=4,
act="tanh",
pool_type="sqrt")
# probability of each class prediction = fluid.layers.fc(input=[conv_3, conv_4],
prediction = fluid.layers.fc(input=[fc_1],
size=class_dim, size=class_dim,
act="softmax") act="softmax")
# cross entropy loss # cross entropy loss
cost = fluid.layers.cross_entropy(input=prediction, label=label) cost = fluid.layers.cross_entropy(input=prediction, label=label)
...@@ -88,58 +76,62 @@ def network_cfg(is_train, pass_num=100): ...@@ -88,58 +76,62 @@ def network_cfg(is_train, pass_num=100):
if is_train: if is_train:
# SGD optimizer # SGD optimizer
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
return { return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle}
'loss': avg_cost,
'log': [avg_cost, acc],
'file': train_file_obj if is_train else test_file_obj
}
def main(): def main():
train = fluid.Program() train = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
test = fluid.Program()
with fluid.program_guard(train, startup): with fluid.program_guard(train, startup):
train_args = network_cfg(is_train=True) train_args = build_program(is_train=True)
test = fluid.Program()
with fluid.program_guard(test, fluid.Program()): with fluid.program_guard(test, startup):
test_args = network_cfg(is_train=False) test_args = build_program(is_train=False)
use_cuda = fluid.core.is_compiled_with_cuda()
# startup # startup
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place=place) exe = fluid.Executor(place=place)
exe.run(startup) exe.run(startup)
train_exe = fluid.ParallelExecutor( train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=train_args['loss'].name, main_program=train) use_cuda=use_cuda,
loss_name=train_args['loss'].name,
main_program=train)
test_exe = fluid.ParallelExecutor(
use_cuda=use_cuda, main_program=test, share_vars_from=train_exe)
fetch_var_list = [var.name for var in train_args['log']] fetch_var_list = [var.name for var in train_args['log']]
for i in xrange(sys.maxint): for epoch_id in range(epoch_num):
result = map(numpy.array, # train
train_exe.run(fetch_list=fetch_var_list try:
if i % 1000 == 0 else [])) batch_id = 0
if len(result) != 0: while True:
print 'Train: ', result loss, acc = map(numpy.array,
train_exe.run(fetch_list=fetch_var_list))
if i % 1000 == 0: print 'Train epoch', epoch_id, 'batch', batch_id, 'loss:', loss, 'acc:', acc
test_exe = fluid.ParallelExecutor( batch_id += 1
use_cuda=True, main_program=test, share_vars_from=train_exe) except fluid.core.EOFException:
print 'End of epoch', epoch_id
train_args['file'].reset()
# test
loss = [] loss = []
acc = [] acc = []
try: try:
while True: while True:
loss_np, acc_np = map( loss_np, acc_np = map(numpy.array,
numpy.array, test_exe.run(fetch_list=fetch_var_list)) test_exe.run(fetch_list=fetch_var_list))
loss.append(loss_np[0]) loss.append(loss_np[0])
acc.append(acc_np[0]) acc.append(acc_np[0])
except: except:
test_args['file'].reset() test_args['file'].reset()
print 'TEST: ', numpy.mean(loss), numpy.mean(acc) print 'Test loss:', numpy.mean(loss), 'acc:', numpy.mean(acc)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -36,7 +36,7 @@ with fluid.program_guard(main_program=prog): ...@@ -36,7 +36,7 @@ with fluid.program_guard(main_program=prog):
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
prog_clip = prog.clone() prog_clip = prog.clone()
prog_clip.block(0).var(hidden1.name).set_error_clip( prog_clip.block(0).var(hidden1.name)._set_error_clip(
fluid.clip.ErrorClipByValue( fluid.clip.ErrorClipByValue(
max=CLIP_MAX, min=CLIP_MIN)) max=CLIP_MAX, min=CLIP_MIN))
......
...@@ -19,6 +19,10 @@ from paddle.fluid.executor import Executor ...@@ -19,6 +19,10 @@ from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import MomentumOptimizer from paddle.fluid.optimizer import MomentumOptimizer
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.layers.control_flow import split_lod_tensor
from paddle.fluid.layers.control_flow import merge_lod_tensor
from paddle.fluid.layers.control_flow import ConditionalBlock
import unittest import unittest
import numpy as np import numpy as np
...@@ -34,11 +38,10 @@ class TestMNISTIfElseOp(unittest.TestCase): ...@@ -34,11 +38,10 @@ class TestMNISTIfElseOp(unittest.TestCase):
limit = layers.fill_constant(shape=[1], dtype='int64', value=5) limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
cond = layers.less_than(x=label, y=limit) cond = layers.less_than(x=label, y=limit)
true_image, false_image = layers.split_lod_tensor( true_image, false_image = split_lod_tensor(input=image, mask=cond)
input=image, mask=cond)
true_out = layers.create_tensor(dtype='float32') true_out = layers.create_tensor(dtype='float32')
true_cond = layers.ConditionalBlock([cond]) true_cond = ConditionalBlock([cond])
with true_cond.block(): with true_cond.block():
hidden = layers.fc(input=true_image, size=100, act='tanh') hidden = layers.fc(input=true_image, size=100, act='tanh')
...@@ -46,14 +49,14 @@ class TestMNISTIfElseOp(unittest.TestCase): ...@@ -46,14 +49,14 @@ class TestMNISTIfElseOp(unittest.TestCase):
layers.assign(input=prob, output=true_out) layers.assign(input=prob, output=true_out)
false_out = layers.create_tensor(dtype='float32') false_out = layers.create_tensor(dtype='float32')
false_cond = layers.ConditionalBlock([cond]) false_cond = ConditionalBlock([cond])
with false_cond.block(): with false_cond.block():
hidden = layers.fc(input=false_image, size=200, act='tanh') hidden = layers.fc(input=false_image, size=200, act='tanh')
prob = layers.fc(input=hidden, size=10, act='softmax') prob = layers.fc(input=hidden, size=10, act='softmax')
layers.assign(input=prob, output=false_out) layers.assign(input=prob, output=false_out)
prob = layers.merge_lod_tensor( prob = merge_lod_tensor(
in_true=true_out, in_false=false_out, mask=cond, x=image) in_true=true_out, in_false=false_out, mask=cond, x=image)
loss = layers.cross_entropy(input=prob, label=label) loss = layers.cross_entropy(input=prob, label=label)
avg_loss = layers.mean(loss) avg_loss = layers.mean(loss)
......
...@@ -251,7 +251,7 @@ class OpTest(unittest.TestCase): ...@@ -251,7 +251,7 @@ class OpTest(unittest.TestCase):
for out_name, out_dup in Operator.get_op_outputs(self.op_type): for out_name, out_dup in Operator.get_op_outputs(self.op_type):
fetch_list.append(str(out_name)) fetch_list.append(str(out_name))
# fetch_list = map(block.var, fetch_list) # fetch_list = map(block.var, fetch_list)
if not isinstance(fetch_list[0], Variable): if not isinstance(fetch_list[0], fluid.framework.Variable):
fetch_list = map(block.var, fetch_list) fetch_list = map(block.var, fetch_list)
outs = executor.run(program, outs = executor.run(program,
feed=feed_map, feed=feed_map,
......
...@@ -18,14 +18,15 @@ import paddle.fluid.core as core ...@@ -18,14 +18,15 @@ import paddle.fluid.core as core
from paddle.fluid.framework import default_startup_program, default_main_program from paddle.fluid.framework import default_startup_program, default_main_program
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
from paddle.fluid.layers.control_flow import ConditionalBlock
import numpy import numpy
class ConditionalBlock(unittest.TestCase): class ConditionalBlockTest(unittest.TestCase):
def test_forward(self): def test_forward(self):
data = layers.data(name='X', shape=[1], dtype='float32') data = layers.data(name='X', shape=[1], dtype='float32')
data.stop_gradient = False data.stop_gradient = False
cond = layers.ConditionalBlock(inputs=[data]) cond = ConditionalBlock(inputs=[data])
out = layers.create_tensor(dtype='float32') out = layers.create_tensor(dtype='float32')
with cond.block(): with cond.block():
hidden = layers.fc(input=data, size=10) hidden = layers.fc(input=data, size=10)
......
...@@ -16,7 +16,7 @@ import unittest ...@@ -16,7 +16,7 @@ import unittest
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
class ConditionalBlock(unittest.TestCase): class ConstantTest(unittest.TestCase):
def test_const_value(self): def test_const_value(self):
self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD") self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@") self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
......
...@@ -17,6 +17,12 @@ import paddle ...@@ -17,6 +17,12 @@ import paddle
import unittest import unittest
import numpy import numpy
from paddle.fluid.layers.control_flow import lod_rank_table
from paddle.fluid.layers.control_flow import max_sequence_len
from paddle.fluid.layers.control_flow import lod_tensor_to_array
from paddle.fluid.layers.control_flow import array_to_lod_tensor
from paddle.fluid.layers.control_flow import shrink_memory
class TestDynRNN(unittest.TestCase): class TestDynRNN(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -38,12 +44,11 @@ class TestDynRNN(unittest.TestCase): ...@@ -38,12 +44,11 @@ class TestDynRNN(unittest.TestCase):
label = fluid.layers.data(name='label', shape=[1], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='float32')
rank_table = fluid.layers.lod_rank_table(x=sent_emb) rank_table = lod_rank_table(x=sent_emb)
sent_emb_array = fluid.layers.lod_tensor_to_array( sent_emb_array = lod_tensor_to_array(x=sent_emb, table=rank_table)
x=sent_emb, table=rank_table)
seq_len = fluid.layers.max_sequence_len(rank_table=rank_table) seq_len = max_sequence_len(rank_table=rank_table)
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
i.stop_gradient = False i.stop_gradient = False
...@@ -66,7 +71,7 @@ class TestDynRNN(unittest.TestCase): ...@@ -66,7 +71,7 @@ class TestDynRNN(unittest.TestCase):
mem = fluid.layers.array_read(array=mem_array, i=i) mem = fluid.layers.array_read(array=mem_array, i=i)
ipt = fluid.layers.array_read(array=sent_emb_array, i=i) ipt = fluid.layers.array_read(array=sent_emb_array, i=i)
mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table) mem = shrink_memory(x=mem, i=i, table=rank_table)
hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh') hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
...@@ -75,8 +80,7 @@ class TestDynRNN(unittest.TestCase): ...@@ -75,8 +80,7 @@ class TestDynRNN(unittest.TestCase):
fluid.layers.array_write(x=hidden, i=i, array=mem_array) fluid.layers.array_write(x=hidden, i=i, array=mem_array)
fluid.layers.less_than(x=i, y=seq_len, cond=cond) fluid.layers.less_than(x=i, y=seq_len, cond=cond)
all_timesteps = fluid.layers.array_to_lod_tensor( all_timesteps = array_to_lod_tensor(x=out, table=rank_table)
x=out, table=rank_table)
last = fluid.layers.sequence_last_step(input=all_timesteps) last = fluid.layers.sequence_last_step(input=all_timesteps)
logits = fluid.layers.fc(input=last, size=1, act=None) logits = fluid.layers.fc(input=last, size=1, act=None)
loss = fluid.layers.sigmoid_cross_entropy_with_logits( loss = fluid.layers.sigmoid_cross_entropy_with_logits(
......
...@@ -91,20 +91,21 @@ class TestLearningRateDecay(unittest.TestCase): ...@@ -91,20 +91,21 @@ class TestLearningRateDecay(unittest.TestCase):
def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn, def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
kwargs): kwargs):
main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
decayed_lr = fluid_decay_fn(**kwargs) decayed_lr = fluid_decay_fn(**kwargs)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(startup_prog)
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(main_prog)
for step in range(10): for step in range(10):
lr_val, = exe.run(fluid.default_main_program(), lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
feed={},
fetch_list=[decayed_lr])
python_decayed_lr = python_decay_fn( python_decayed_lr = python_decay_fn(
global_step=float(step), **kwargs) global_step=float(step), **kwargs)
self.assertAlmostEqual( self.assertAlmostEqual(
......
...@@ -12,7 +12,8 @@ ...@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle.fluid.layers import lod_rank_table, data from paddle.fluid.layers import data
from paddle.fluid.layers.control_flow import lod_rank_table
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy import numpy
......
...@@ -20,6 +20,11 @@ from paddle.fluid.framework import Program, program_guard ...@@ -20,6 +20,11 @@ from paddle.fluid.framework import Program, program_guard
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
from paddle.fluid.layers.control_flow import lod_rank_table
from paddle.fluid.layers.control_flow import max_sequence_len
from paddle.fluid.layers.control_flow import lod_tensor_to_array
from paddle.fluid.layers.control_flow import array_to_lod_tensor
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
def place(self): def place(self):
...@@ -137,13 +142,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ...@@ -137,13 +142,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
with program_guard(program): with program_guard(program):
x = layers.data(name='x', shape=[10]) x = layers.data(name='x', shape=[10])
x.persistable = True x.persistable = True
table = layers.lod_rank_table(x, level=level) table = lod_rank_table(x, level=level)
max_len = layers.max_sequence_len(table) max_len = max_sequence_len(table)
max_len.persistable = True max_len.persistable = True
array = layers.lod_tensor_to_array(x, table) array = lod_tensor_to_array(x, table)
array.persistable = True array.persistable = True
result = layers.array_to_lod_tensor(array, table) result = array_to_lod_tensor(array, table)
result.persistable = True result.persistable = True
exe = Executor(place) exe = Executor(place)
scope = core.Scope() scope = core.Scope()
...@@ -181,9 +186,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): ...@@ -181,9 +186,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
with program_guard(program): with program_guard(program):
x = layers.data( x = layers.data(
name='x', shape=[1], dtype='float32', stop_gradient=False) name='x', shape=[1], dtype='float32', stop_gradient=False)
table = layers.lod_rank_table(x, level=0) table = lod_rank_table(x, level=0)
array = layers.lod_tensor_to_array(x, table) array = lod_tensor_to_array(x, table)
result = layers.array_to_lod_tensor(array, table) result = array_to_lod_tensor(array, table)
mean = layers.mean(result) mean = layers.mean(result)
......
...@@ -107,44 +107,24 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -107,44 +107,24 @@ class TestMNIST(TestParallelExecutorBase):
label = np.ones(shape=[32, 1], dtype='int64') label = np.ones(shape=[32, 1], dtype='int64')
return img, label return img, label
# simple_fc def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True):
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
return return
self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
self.check_network_convergence( self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) model, use_cuda=use_cuda, use_reduce=True)
img, label = self._init_data()
self.check_network_convergence( self.check_network_convergence(
simple_fc_net, model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True)
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=use_reduce)
def check_simple_fc_convergence_with_Reduce(self, use_cuda): img, label = self._init_data(random_data)
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, use_reduce=True)
self.check_network_convergence(
simple_fc_net,
use_cuda=use_cuda,
allow_op_delay=True,
use_reduce=True)
img, label = self._init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
simple_fc_net, model,
feed_dict={"image": img, feed_dict={"image": img,
"label": label}, "label": label},
use_cuda=use_cuda, use_cuda=use_cuda,
use_reduce=False) use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence( reduce_first_loss, reduce_last_loss = self.check_network_convergence(
simple_fc_net, model,
feed_dict={"image": img, feed_dict={"image": img,
"label": label}, "label": label},
use_cuda=use_cuda, use_cuda=use_cuda,
...@@ -153,7 +133,24 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -153,7 +133,24 @@ class TestMNIST(TestParallelExecutorBase):
for loss in zip(all_reduce_first_loss, reduce_first_loss): for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(all_reduce_last_loss, reduce_last_loss): for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
# simple_fc
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
img, label = self._init_data()
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=use_reduce)
def test_simple_fc(self): def test_simple_fc(self):
# use_cuda # use_cuda
...@@ -162,8 +159,8 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -162,8 +159,8 @@ class TestMNIST(TestParallelExecutorBase):
def test_simple_fc_with_new_strategy(self): def test_simple_fc_with_new_strategy(self):
# use_cuda, use_reduce # use_cuda, use_reduce
self.check_simple_fc_convergence_with_Reduce(True) self._compare_reduce_and_allreduce(simple_fc_net, True)
self.check_simple_fc_convergence_with_Reduce(False) self._compare_reduce_and_allreduce(simple_fc_net, False)
def check_simple_fc_parallel_accuracy(self, use_cuda): def check_simple_fc_parallel_accuracy(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
...@@ -209,39 +206,13 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -209,39 +206,13 @@ class TestMNIST(TestParallelExecutorBase):
"label": label}, "label": label},
use_cuda=use_cuda) use_cuda=use_cuda)
def check_batchnorm_fc_convergence_use_reduce(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(
fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True)
img, label = self._init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=True)
for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
def test_batchnorm_fc(self): def test_batchnorm_fc(self):
self.check_batchnorm_fc_convergence(True) self.check_batchnorm_fc_convergence(True)
self.check_batchnorm_fc_convergence(False) self.check_batchnorm_fc_convergence(False)
def test_batchnorm_fc_with_new_strategy(self): def test_batchnorm_fc_with_new_strategy(self):
self.check_batchnorm_fc_convergence_use_reduce(True) self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
self.check_batchnorm_fc_convergence_use_reduce(False) self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -120,7 +120,7 @@ class BaseParallelForTest(unittest.TestCase): ...@@ -120,7 +120,7 @@ class BaseParallelForTest(unittest.TestCase):
pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
data = next(generator) data = next(generator)
if isinstance(data, fluid.Variable): if isinstance(data, fluid.framework.Variable):
data = [data] data = [data]
with pd.do(): with pd.do():
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.layers.control_flow import lod_rank_table
import numpy import numpy
...@@ -34,7 +35,7 @@ class TestReorderLoDTensor(unittest.TestCase): ...@@ -34,7 +35,7 @@ class TestReorderLoDTensor(unittest.TestCase):
dat.stop_gradient = False dat.stop_gradient = False
rank_dat = fluid.layers.data( rank_dat = fluid.layers.data(
name=cls.data_desc[1][0], shape=cls.data_desc[1][1]) name=cls.data_desc[1][0], shape=cls.data_desc[1][1])
table = fluid.layers.lod_rank_table(rank_dat) table = lod_rank_table(rank_dat)
new_dat = fluid.layers.reorder_lod_tensor_by_rank( new_dat = fluid.layers.reorder_lod_tensor_by_rank(
x=dat, rank_table=table) x=dat, rank_table=table)
loss = fluid.layers.reduce_sum(new_dat) loss = fluid.layers.reduce_sum(new_dat)
......
...@@ -21,6 +21,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program ...@@ -21,6 +21,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program
from paddle.fluid.framework import Program from paddle.fluid.framework import Program
import numpy as np import numpy as np
from paddle.fluid.layers.control_flow import shrink_memory
from paddle.fluid.layers.control_flow import lod_rank_table
class TestShrinkRNNMemoryBase(unittest.TestCase): class TestShrinkRNNMemoryBase(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -30,15 +33,15 @@ class TestShrinkRNNMemoryBase(unittest.TestCase): ...@@ -30,15 +33,15 @@ class TestShrinkRNNMemoryBase(unittest.TestCase):
x.stop_gradient = False x.stop_gradient = False
rank_table_tensor = layers.data( rank_table_tensor = layers.data(
'rank_table_tensor', shape=[1], dtype='float32', lod_level=1) 'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
table = layers.lod_rank_table(x=rank_table_tensor) table = lod_rank_table(x=rank_table_tensor)
i = layers.zeros(dtype='int64', shape=[1]) i = layers.zeros(dtype='int64', shape=[1])
self.mem1 = layers.shrink_memory(x=x, i=i, table=table) self.mem1 = shrink_memory(x=x, i=i, table=table)
i = layers.increment(x=i) i = layers.increment(x=i)
i.stop_gradient = True i.stop_gradient = True
self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table) self.mem2 = shrink_memory(x=self.mem1, i=i, table=table)
i = layers.increment(x=i) i = layers.increment(x=i)
i.stop_gradient = True i.stop_gradient = True
self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table) self.mem3 = shrink_memory(x=self.mem2, i=i, table=table)
mem3_mean = layers.mean(self.mem3) mem3_mean = layers.mean(self.mem3)
append_backward(loss=mem3_mean) append_backward(loss=mem3_mean)
self.x_grad = self.main_program.global_block().var('x@GRAD') self.x_grad = self.main_program.global_block().var('x@GRAD')
......
...@@ -19,6 +19,8 @@ import paddle.fluid.layers as layers ...@@ -19,6 +19,8 @@ import paddle.fluid.layers as layers
from paddle.fluid.framework import Program, program_guard from paddle.fluid.framework import Program, program_guard
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
from paddle.fluid.layers.control_flow import split_lod_tensor
from paddle.fluid.layers.control_flow import merge_lod_tensor
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
...@@ -96,12 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ...@@ -96,12 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
y = layers.data(name='y', shape=[1]) y = layers.data(name='y', shape=[1])
y.persistable = True y.persistable = True
out_true, out_false = layers.split_lod_tensor( out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
input=x, mask=y, level=level)
out_true.persistable = True out_true.persistable = True
out_false.persistable = True out_false.persistable = True
out = layers.merge_lod_tensor( out = merge_lod_tensor(
in_true=out_true, in_false=out_false, mask=y, x=x, level=level) in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
out.persistable = True out.persistable = True
...@@ -142,9 +143,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): ...@@ -142,9 +143,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
level = 0 level = 0
out_true, out_false = layers.split_lod_tensor( out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
input=x, mask=y, level=level) out = merge_lod_tensor(
out = layers.merge_lod_tensor(
in_true=out_true, in_false=out_false, mask=y, x=x, level=level) in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
mean = layers.mean(out) mean = layers.mean(out)
......
...@@ -38,7 +38,7 @@ from ps_dispatcher import RoundRobin, HashName, PSDispatcher ...@@ -38,7 +38,7 @@ from ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework from .. import core, framework
from ..framework import Program, default_main_program, \ from ..framework import Program, default_main_program, \
default_startup_program, Block, \ default_startup_program, Block, \
Variable, Parameter, grad_var_name Parameter, grad_var_name
from details import * from details import *
LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_TYPE = "lookup_table"
...@@ -918,7 +918,8 @@ class DistributeTranspiler(object): ...@@ -918,7 +918,8 @@ class DistributeTranspiler(object):
# create table optimize block in pserver program # create table optimize block in pserver program
table_opt_op = [ table_opt_op = [
op for op in self.optimize_ops op for op in self.optimize_ops
if op.input("Param")[0] == self.table_name if 'Param' in op.input_names and op.input("Param")[0] ==
self.table_name
][0] ][0]
table_opt_block = pserver_program.create_block(pre_block_idx) table_opt_block = pserver_program.create_block(pre_block_idx)
# only support sgd now # only support sgd now
...@@ -1075,7 +1076,6 @@ class DistributeTranspiler(object): ...@@ -1075,7 +1076,6 @@ class DistributeTranspiler(object):
] ]
def _clone_var(self, block, var, persistable=True): def _clone_var(self, block, var, persistable=True):
assert isinstance(var, Variable)
return block.create_var( return block.create_var(
name=var.name, name=var.name,
shape=var.shape, shape=var.shape,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from collections import defaultdict from collections import defaultdict
from .. import core from .. import core
from ..framework import Program, default_main_program, Parameter, Variable from ..framework import Program, default_main_program, Parameter
from ..backward import _rename_arg_ from ..backward import _rename_arg_
dtype_to_size = { dtype_to_size = {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册