提交 018e2f3a 编写于 作者: T tangwei12

Merge branch 'dis_ckpt_fix' of github.com:seiriosPlus/Paddle into dis_ckpt_fix

paddle.fluid.Variable.__init__ ArgSpec(args=['self', 'block', 'type', 'name', 'shape', 'dtype', 'lod_level', 'capacity', 'persistable', 'error_clip', 'stop_gradient', 'is_data'], varargs=None, keywords='kwargs', defaults=(VarType.LOD_TENSOR, None, None, None, None, None, None, None, False, False))
paddle.fluid.Variable.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
...@@ -33,8 +28,6 @@ paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=Non ...@@ -33,8 +28,6 @@ paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=Non
paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None) paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None)
paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
...@@ -42,8 +35,7 @@ paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', def ...@@ -42,8 +35,7 @@ paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', def
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.begin_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.end_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
...@@ -207,31 +199,23 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None ...@@ -207,31 +199,23 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.lod_rank_table ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.max_sequence_len ArgSpec(args=['rank_table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.lod_tensor_to_array ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_to_lod_tensor ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)) paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True))
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)) paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shrink_memory ArgSpec(args=['x', 'i', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
paddle.fluid.layers.IfElse.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
...@@ -240,9 +224,6 @@ paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', ...@@ -240,9 +224,6 @@ paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs',
paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ConditionalBlock.__init__ ArgSpec(args=['self', 'inputs', 'is_scalar_condition', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.ConditionalBlock.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ConditionalBlock.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
......
...@@ -45,19 +45,13 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { ...@@ -45,19 +45,13 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
void Executor::BeginPass() {
::paddle::operators::distributed::RPCClient::GetInstance< ::paddle::operators::distributed::RPCClient::GetInstance<
::paddle::operators::distributed::GRPCClient>() ::paddle::operators::distributed::GRPCClient>()
->SendBeginPass(); ->SendComplete();
}
void Executor::EndPass() {
::paddle::operators::distributed::RPCClient::GetInstance<
::paddle::operators::distributed::GRPCClient>()
->SendEndPass();
}
#endif #endif
}
void InitializeVariable(Variable* var, proto::VarType::Type var_type) { void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) { if (var_type == proto::VarType::LOD_TENSOR) {
......
...@@ -44,17 +44,11 @@ class Executor { ...@@ -44,17 +44,11 @@ class Executor {
explicit Executor(const platform::Place& place); explicit Executor(const platform::Place& place);
#ifdef PADDLE_WITH_DISTRIBUTE
/* /*
* Sending signal to pserver to mark current pass started. * Close this Executor.
* Calling this method will send complete messages to all pserver instances.
*/ */
void BeginPass(); void Close();
/*
* Sending signal to pserver to mark current pass finished.
*/
void EndPass();
#endif
/* @Brief /* @Brief
* Runtime evaluation of the given ProgramDesc under certain Scope * Runtime evaluation of the given ProgramDesc under certain Scope
......
...@@ -137,6 +137,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -137,6 +137,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
executor_->RunPreparedContext( executor_->RunPreparedContext(
ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(), ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
&feed_targets, &fetch_targets, &feed_targets, &fetch_targets,
false, /* don't create local scope each time*/
false /* don't create variable eatch time */); false /* don't create variable eatch time */);
VLOG(4) << "Finish prepared context"; VLOG(4) << "Finish prepared context";
if (!GetFetch(fetchs, output_data)) { if (!GetFetch(fetchs, output_data)) {
......
...@@ -32,11 +32,11 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, ...@@ -32,11 +32,11 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
for (int h = 0; h < shape.h(); ++h) { for (int h = 0; h < shape.h(); ++h) {
for (int w = 0; w < shape.w(); ++w) { for (int w = 0; w < shape.w(); ++w) {
odata[h * ostrides.h() + w * ostrides.w()] = odata[h * ostrides.h() + w * ostrides.w()] =
idata[h * ostrides.h() + w * ostrides.w()]; idata[h * istrides.h() + w * istrides.w()];
} }
} }
} }
// indata c * k
// Reorder the data layout from CK to KC. // Reorder the data layout from CK to KC.
void ReorderCKtoKC(TensorRTEngine::Weight& iweights, void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
TensorRTEngine::Weight* oweights) { TensorRTEngine::Weight* oweights) {
...@@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter { ...@@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter {
framework::LoDTensor tmp; framework::LoDTensor tmp;
tmp.Resize(Y_t->dims()); tmp.Resize(Y_t->dims());
memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(), memcpy(tmp.mutable_data<float>(platform::CPUPlace()), weight_data,
Y_t->dims()[0] * Y_t->dims()[1]); Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data), static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)}; Y_t->memory_size() / sizeof(float)};
...@@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter { ...@@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter {
// The data layout of TRT FC layer's weight is different from fluid's FC, // The data layout of TRT FC layer's weight is different from fluid's FC,
// need to reorder the elements. // need to reorder the elements.
ReorderCKtoKC(tmp_weight, &weight); ReorderCKtoKC(weight, &tmp_weight);
// Currently, the framework can only handle one fluid op -> one TRT layer, // Currently, the framework can only handle one fluid op -> one TRT layer,
// but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
...@@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter { ...@@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter {
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
*const_cast<nvinfer1::ITensor*>(X), *const_cast<nvinfer1::ITensor*>(X),
n_output, weight.get(), bias.get()); n_output, tmp_weight.get(), bias.get());
auto output_name = op_desc.Output("Out").front(); auto output_name = op_desc.Output("Out").front();
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
...@@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter { ...@@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter {
} // namespace paddle } // namespace paddle
REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
USE_OP(mul);
...@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) { ...@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
LOG(INFO) << "execute"; LOG(INFO) << "execute";
validator.Execute(10); validator.Execute(1);
} }
} // namespace tensorrt } // namespace tensorrt
......
...@@ -23,11 +23,11 @@ namespace tensorrt { ...@@ -23,11 +23,11 @@ namespace tensorrt {
TEST(fc_op, test) { TEST(fc_op, test) {
std::unordered_set<std::string> parameters({"mul-Y"}); std::unordered_set<std::string> parameters({"mul-Y"});
framework::Scope scope; framework::Scope scope;
TRTConvertValidation validator(20, parameters, scope, 1000); TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1));
validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1)); validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2)); // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
// Prepare Op description // Prepare Op description
framework::OpDesc desc; framework::OpDesc desc;
...@@ -38,9 +38,10 @@ TEST(fc_op, test) { ...@@ -38,9 +38,10 @@ TEST(fc_op, test) {
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
validator.Execute(10); validator.Execute(1);
} }
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(mul);
...@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) { ...@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) {
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
LOG(INFO) << "execute"; LOG(INFO) << "execute";
validator.Execute(10); validator.Execute(1);
} }
} // namespace tensorrt } // namespace tensorrt
......
...@@ -39,7 +39,7 @@ namespace tensorrt { ...@@ -39,7 +39,7 @@ namespace tensorrt {
float random(float low, float high) { float random(float low, float high) {
static std::random_device rd; static std::random_device rd;
static std::mt19937 mt(rd()); static std::mt19937 mt(rd());
std::uniform_real_distribution<double> dist(1.0, 10.0); std::uniform_real_distribution<double> dist(low, high);
return dist(mt); return dist(mt);
} }
...@@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, ...@@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
size_t num_elements = analysis::AccuDims(dims, dims.size()); size_t num_elements = analysis::AccuDims(dims, dims.size());
PADDLE_ENFORCE_GT(num_elements, 0); PADDLE_ENFORCE_GT(num_elements, 0);
auto* data = tensor->mutable_data<float>(place); auto* data = tensor->mutable_data<float>(place);
for (size_t i = 0; i < num_elements; i++) { for (size_t i = 0; i < num_elements; i++) {
*(data + i) = random(0., 1.); *(data + i) = random(0., 1.);
} }
...@@ -68,7 +69,7 @@ class TRTConvertValidation { ...@@ -68,7 +69,7 @@ class TRTConvertValidation {
int workspace_size = 1 << 10) int workspace_size = 1 << 10)
: parameters_(parameters), scope_(scope) { : parameters_(parameters), scope_(scope) {
// create engine. // create engine.
engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_));
engine_->InitNetwork(); engine_->InitNetwork();
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
...@@ -138,12 +139,11 @@ class TRTConvertValidation { ...@@ -138,12 +139,11 @@ class TRTConvertValidation {
cudaStreamSynchronize(*engine_->stream()); cudaStreamSynchronize(*engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 200; const size_t output_space_size = 2000;
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
std::vector<float> fluid_out; std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size); std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0], engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
output_space_size * sizeof(float));
cudaStreamSynchronize(*engine_->stream()); cudaStreamSynchronize(*engine_->stream());
auto* var = scope_.FindVar(output); auto* var = scope_.FindVar(output);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License"); you may not use
you may not use this file except in compliance with the License. this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
...@@ -26,6 +26,8 @@ namespace paddle { ...@@ -26,6 +26,8 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
int TensorRTEngine::runtime_batch_ = 1;
void TensorRTEngine::Build(const DescType &paddle_model) { void TensorRTEngine::Build(const DescType &paddle_model) {
PADDLE_ENFORCE(false, "not implemented"); PADDLE_ENFORCE(false, "not implemented");
} }
...@@ -42,6 +44,7 @@ void TensorRTEngine::Execute(int batch_size) { ...@@ -42,6 +44,7 @@ void TensorRTEngine::Execute(int batch_size) {
PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_NOT_NULL(stream_);
infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
cudaStreamSynchronize(*stream_); cudaStreamSynchronize(*stream_);
SetRuntimeBatch(batch_size);
} }
TensorRTEngine::~TensorRTEngine() { TensorRTEngine::~TensorRTEngine() {
...@@ -80,17 +83,17 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -80,17 +83,17 @@ void TensorRTEngine::FreezeNetwork() {
auto dims = infer_engine_->getBindingDimensions(slot_offset); auto dims = infer_engine_->getBindingDimensions(slot_offset);
item.second = kDataTypeSize[static_cast<int>( item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] * infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims); analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE_GT(item.second, 0); PADDLE_ENFORCE_GT(item.second, 0);
} }
auto &buf = buffer(item.first); auto &buf = buffer(item.first);
buf.max_size = item.second * max_batch_; buf.max_size = item.second * max_batch_;
CHECK(buf.buffer == nullptr); // buffer should be allocated only once. CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
// buf.size will changed in the runtime.
buf.size = 0; buf.size = 0;
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G
buf.device = DeviceType::GPU; buf.device = DeviceType::GPU;
} }
} }
...@@ -105,7 +108,7 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, ...@@ -105,7 +108,7 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
auto *input = infer_network_->addInput(name.c_str(), dtype, dims); auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name); PADDLE_ENFORCE(input, "infer network add input %s failed", name);
buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
analysis::AccuDims(dims.d, dims.nbDims); analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE(input->isNetworkInput()); PADDLE_ENFORCE(input->isNetworkInput());
TensorRTEngine::SetITensor(name, input); TensorRTEngine::SetITensor(name, input);
return input; return input;
...@@ -149,35 +152,42 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) { ...@@ -149,35 +152,42 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
size_t max_size) { size_t max_size) {
// determine data size // determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name); auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0); PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second); PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name); auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second, PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToDevice, *stream_), cudaMemcpyDeviceToDevice, *stream_),
0); 0);
} }
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
size_t max_size) { size_t max_size) {
VLOG(4) << "get output in cpu";
auto &buf = buffer(name);
// Update needed buffer size.
auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
auto dims = infer_engine_->getBindingDimensions(slot_offset);
buf.size = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims);
PADDLE_ENFORCE_LE(buf.size, buf.max_size);
// determine data size // determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
// DEBUG PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
memset(dst, 0, buf.size); cudaMemcpyDeviceToHost, *stream_));
PADDLE_ENFORCE_EQ(
0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
} }
Buffer &TensorRTEngine::buffer(const std::string &name) { Buffer &TensorRTEngine::buffer(const std::string &name) {
...@@ -225,6 +235,12 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { ...@@ -225,6 +235,12 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
return itensor_map_[name]; return itensor_map_[name];
} }
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
runtime_batch_ = batch_size;
}
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -117,10 +117,14 @@ class TensorRTEngine : public EngineBase { ...@@ -117,10 +117,14 @@ class TensorRTEngine : public EngineBase {
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
private: private:
// the max batch size // the max batch size
int max_batch_; int max_batch_;
// the runtime batch size
static int runtime_batch_;
// the max memory size the engine uses // the max memory size the engine uses
int max_workspace_; int max_workspace_;
......
...@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test { ...@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ASSERT_EQ(0, cudaStreamCreate(&stream_)); ASSERT_EQ(0, cudaStreamCreate(&stream_));
engine_ = new TensorRTEngine(1, 1 << 10, &stream_); engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
engine_->InitNetwork(); engine_->InitNetwork();
} }
...@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float y_cpu; float y_cpu;
engine_->GetOutputInCPU("y", &y_cpu, sizeof(float)); engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
LOG(INFO) << "to checkout output"; LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu, x_v * 2 + 3); ASSERT_EQ(y_cpu, x_v * 2 + 3);
...@@ -103,15 +103,49 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ...@@ -103,15 +103,49 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float y_cpu[2] = {-1., -1.}; float y_cpu[2] = {-1., -1.};
auto dims = engine_->GetITensor("y")->getDimensions(); auto dims = engine_->GetITensor("y")->getDimensions();
ASSERT_EQ(dims.nbDims, 3); ASSERT_EQ(dims.nbDims, 3);
ASSERT_EQ(dims.d[0], 2); ASSERT_EQ(dims.d[0], 2);
ASSERT_EQ(dims.d[1], 1); ASSERT_EQ(dims.d[1], 1);
engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2); engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[0], 4.5);
ASSERT_EQ(y_cpu[1], 14.5); ASSERT_EQ(y_cpu[1], 14.5);
} }
TEST_F(TensorRTEngineTest, test_conv2d_temp) {
// Weight in CPU memory.
float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
float raw_bias[1] = {0};
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 3, 3});
auto* conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr);
conv_layer->setStride(nvinfer1::DimsHW{1, 1});
conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
engine_->DeclareOutput(conv_layer, 0, "y");
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
18 * sizeof(float));
engine_->Execute(2);
LOG(INFO) << "to get output";
float* y_cpu = new float[18];
engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
ASSERT_EQ(y_cpu[0], 4.0);
ASSERT_EQ(y_cpu[1], 6.0);
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -210,13 +210,14 @@ void TestInference(const std::string& dirname, ...@@ -210,13 +210,14 @@ void TestInference(const std::string& dirname,
// Ignore the profiling results of the first run // Ignore the profiling results of the first run
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx; std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
bool CreateLocalScope = CreateVars;
if (PrepareContext) { if (PrepareContext) {
ctx = executor.Prepare(*inference_program, 0); ctx = executor.Prepare(*inference_program, 0);
executor.RunPreparedContext(ctx.get(), scope, &feed_targets, executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
&fetch_targets, true, CreateVars); &fetch_targets, CreateLocalScope, CreateVars);
} else { } else {
executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
true, CreateVars); CreateLocalScope, CreateVars);
} }
// Enable the profiler // Enable the profiler
...@@ -232,10 +233,11 @@ void TestInference(const std::string& dirname, ...@@ -232,10 +233,11 @@ void TestInference(const std::string& dirname,
// Note: if you change the inference_program, you need to call // Note: if you change the inference_program, you need to call
// executor.Prepare() again to get a new ExecutorPrepareContext. // executor.Prepare() again to get a new ExecutorPrepareContext.
executor.RunPreparedContext(ctx.get(), scope, &feed_targets, executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
&fetch_targets, CreateVars); &fetch_targets, CreateLocalScope,
CreateVars);
} else { } else {
executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
CreateVars); CreateLocalScope, CreateVars);
} }
} }
......
...@@ -18,7 +18,7 @@ if(WITH_GRPC) ...@@ -18,7 +18,7 @@ if(WITH_GRPC)
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(grpc_serde_test SRCS grpc_serde_test.cc cc_test(grpc_serde_test SRCS grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(grpc_server_test SRCS rpc_server_test.cc cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL)
return() return()
endif() endif()
......
...@@ -36,20 +36,16 @@ void GRPCClient::InitEventLoop() { ...@@ -36,20 +36,16 @@ void GRPCClient::InitEventLoop() {
client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
} }
void GRPCClient::SendBeginPass() { void GRPCClient::SendComplete() {
for (auto& it : channels_) { std::unique_lock<std::mutex> lk(completed_mutex_);
VLOG(3) << "send begin pass to: " << it.first; if (!completed_) {
this->AsyncSendBeginPass(it.first); for (auto& it : channels_) {
} VLOG(3) << "send complete message to " << it.first;
this->Wait(); this->AsyncSendComplete(it.first);
} }
PADDLE_ENFORCE(this->Wait(), "internal grpc error");
void GRPCClient::SendEndPass() { completed_ = true;
for (auto& it : channels_) {
VLOG(3) << "send end pass to " << it.first;
this->AsyncSendEndPass(it.first);
} }
this->Wait();
} }
GRPCClient::~GRPCClient() { GRPCClient::~GRPCClient() {
...@@ -239,32 +235,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -239,32 +235,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
req_count_++; req_count_++;
} }
void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) { void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
s->Prepare(time_out); s->Prepare(time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(BEGIN_PASS_MESSAGE); req.set_varname(COMPLETE_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
} }
void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) {
const auto ch = GetChannel(ep);
FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
s->Prepare(time_out);
sendrecv::VariableMessage req;
req.set_varname(END_PASS_MESSAGE);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
}
void GRPCClient::AsyncCheckpointNotify(const std::string& ep, void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const std::string& dir, const std::string& dir,
int64_t time_out) { int64_t time_out) {
......
...@@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor { ...@@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
class GRPCClient : public RPCClient { class GRPCClient : public RPCClient {
public: public:
GRPCClient() : ok_(true) {} GRPCClient() : ok_(true), completed_(false) {}
virtual ~GRPCClient(); virtual ~GRPCClient();
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
...@@ -201,17 +201,12 @@ class GRPCClient : public RPCClient { ...@@ -201,17 +201,12 @@ class GRPCClient : public RPCClient {
void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendBeginPass(const std::string& ep, void AsyncSendComplete(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendEndPass(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override;
bool Wait() override; bool Wait() override;
void SendBeginPass() override; void SendComplete() override;
void SendEndPass() override;
protected: protected:
void InitImpl() override; void InitImpl() override;
...@@ -238,6 +233,10 @@ class GRPCClient : public RPCClient { ...@@ -238,6 +233,10 @@ class GRPCClient : public RPCClient {
// mutex for GetChannel thread safety // mutex for GetChannel thread safety
std::mutex chan_mutex_; std::mutex chan_mutex_;
DISABLE_COPY_AND_ASSIGN(GRPCClient); DISABLE_COPY_AND_ASSIGN(GRPCClient);
// mutex for sending complete message only once
std::mutex completed_mutex_;
bool completed_;
}; };
} // namespace distributed } // namespace distributed
......
...@@ -43,8 +43,6 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; ...@@ -43,8 +43,6 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
#define COMPLETE_MESSAGE "COMPLETE@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV"
#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV"
#define END_PASS_MESSAGE "END_PASS@RECV"
#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
......
...@@ -55,10 +55,9 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -55,10 +55,9 @@ bool RequestSendHandler::Handle(const std::string& varname,
if (varname == BATCH_BARRIER_MESSAGE) { if (varname == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
rpc_server_->IncreaseBatchBarrier(kRequestSend); rpc_server_->IncreaseBatchBarrier(kRequestSend);
} else if (varname == BEGIN_PASS_MESSAGE) { } else if (varname == COMPLETE_MESSAGE) {
VLOG(3) << "sync: recv begin pass message"; VLOG(3) << "sync: recv complete message";
rpc_server_->WaitCond(kRequestSend); rpc_server_->Complete();
rpc_server_->BeginPass();
} else { } else {
VLOG(3) << "sync: received var_name: " << varname; VLOG(3) << "sync: received var_name: " << varname;
rpc_server_->WaitCond(kRequestSend); rpc_server_->WaitCond(kRequestSend);
...@@ -94,14 +93,12 @@ bool RequestGetHandler::Handle(const std::string& varname, ...@@ -94,14 +93,12 @@ bool RequestGetHandler::Handle(const std::string& varname,
if (varname == FETCH_BARRIER_MESSAGE) { if (varname == FETCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv fetch barrier message"; VLOG(3) << "sync: recv fetch barrier message";
rpc_server_->IncreaseBatchBarrier(kRequestGet); rpc_server_->IncreaseBatchBarrier(kRequestGet);
} else if (varname == END_PASS_MESSAGE) {
rpc_server_->EndPass();
} else { } else {
rpc_server_->WaitCond(kRequestGet); rpc_server_->WaitCond(kRequestGet);
*outvar = scope_->FindVar(varname); *outvar = scope_->FindVar(varname);
} }
} else { } else {
if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) { if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) {
*outvar = scope_->FindVar(varname); *outvar = scope_->FindVar(varname);
} }
} }
......
...@@ -60,17 +60,13 @@ class RPCClient { ...@@ -60,17 +60,13 @@ class RPCClient {
const std::string& dir, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual void AsyncSendBeginPass(const std::string& ep, virtual void AsyncSendComplete(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual void AsyncSendEndPass(const std::string& ep, // Complete tells all the pserver instances that finishe the training,
int64_t time_out = FLAGS_rpc_deadline) = 0; // the pserver can reduce it's barrier count, and continue to train
// BeginePass/EndPass tells all the pserver that start/end a pass, so that
// the pserver can increase/reduce it's barrier count, and continue to train
// with other trainers. // with other trainers.
virtual void SendBeginPass() = 0; virtual void SendComplete() = 0;
virtual void SendEndPass() = 0;
virtual bool Wait() = 0; virtual bool Wait() = 0;
......
...@@ -64,18 +64,7 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { ...@@ -64,18 +64,7 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
} }
} }
void RPCServer::BeginPass() { void RPCServer::Complete() {
VLOG(4) << "RPCServer begin increase pass barrier";
{
std::unique_lock<std::mutex> lock(mutex_);
client_num_++;
VLOG(4) << "increase client_num to: " << client_num_;
}
barrier_cond_.notify_all();
}
void RPCServer::EndPass() {
VLOG(4) << "RPCServer begin increase pass barrier";
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
client_num_--; client_num_--;
...@@ -87,6 +76,11 @@ void RPCServer::EndPass() { ...@@ -87,6 +76,11 @@ void RPCServer::EndPass() {
barrier_cond_.notify_all(); barrier_cond_.notify_all();
} }
int RPCServer::GetClientNum() {
std::unique_lock<std::mutex> lock(mutex_);
return client_num_;
}
void RPCServer::ResetBarrierCounter() { void RPCServer::ResetBarrierCounter() {
VLOG(3) << "RPCServer ResetBarrierCounter "; VLOG(3) << "RPCServer ResetBarrierCounter ";
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
......
...@@ -44,7 +44,7 @@ class RPCServer { ...@@ -44,7 +44,7 @@ class RPCServer {
int GetSelectedPort() const { return selected_port_; } int GetSelectedPort() const { return selected_port_; }
int GetClientNum() const; int GetClientNum();
void SavePort() const; void SavePort() const;
...@@ -64,8 +64,7 @@ class RPCServer { ...@@ -64,8 +64,7 @@ class RPCServer {
void WaitCond(const std::string& rpc_name); void WaitCond(const std::string& rpc_name);
void IncreaseBatchBarrier(const std::string rpc_name); void IncreaseBatchBarrier(const std::string rpc_name);
void BeginPass(); void Complete();
void EndPass();
void ResetBarrierCounter(); void ResetBarrierCounter();
......
...@@ -91,7 +91,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, ...@@ -91,7 +91,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
} }
} }
void StartServer() { void StartServer(const std::string& rpc_name) {
framework::ProgramDesc program; framework::ProgramDesc program;
framework::Scope scope; framework::Scope scope;
platform::CPUPlace place; platform::CPUPlace place;
...@@ -107,14 +107,14 @@ void StartServer() { ...@@ -107,14 +107,14 @@ void StartServer() {
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
prefetch_var_name_to_prepared; prefetch_var_name_to_prepared;
prefetch_var_name_to_prepared[in_var_name] = prepared[0]; prefetch_var_name_to_prepared[in_var_name] = prepared[0];
g_req_handler->SetProgram(&program); g_req_handler->SetProgram(&program);
g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
g_req_handler->SetDevCtx(&ctx); g_req_handler->SetDevCtx(&ctx);
g_req_handler->SetScope(&scope); g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe); g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(distributed::kRequestPrefetch, g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get()); g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread( std::thread server_thread(
...@@ -129,7 +129,7 @@ TEST(PREFETCH, CPU) { ...@@ -129,7 +129,7 @@ TEST(PREFETCH, CPU) {
distributed::RPCClient* client = distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::thread server_thread(StartServer); std::thread server_thread(StartServer, distributed::kRequestPrefetch);
g_rpc_service->WaitServerReady(); g_rpc_service->WaitServerReady();
int port = g_rpc_service->GetSelectedPort(); int port = g_rpc_service->GetSelectedPort();
...@@ -162,3 +162,24 @@ TEST(PREFETCH, CPU) { ...@@ -162,3 +162,24 @@ TEST(PREFETCH, CPU) {
g_rpc_service.reset(nullptr); g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr); g_req_handler.reset(nullptr);
} }
TEST(COMPLETE, CPU) {
g_req_handler.reset(new distributed::RequestSendHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
PADDLE_ENFORCE(client != nullptr);
std::thread server_thread(StartServer, distributed::kRequestSend);
g_rpc_service->WaitServerReady();
int port = g_rpc_service->GetSelectedPort();
std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
client->AsyncSendComplete(ep);
client->Wait();
EXPECT_EQ(g_rpc_service->GetClientNum(), 1);
g_rpc_service->ShutDown();
server_thread.join();
g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr);
}
...@@ -38,12 +38,10 @@ class LoDTensorBlockingQueue { ...@@ -38,12 +38,10 @@ class LoDTensorBlockingQueue {
public: public:
bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) { bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
CheckDims(lod_tensor_vec);
return queue_.Send(lod_tensor_vec); return queue_.Send(lod_tensor_vec);
} }
bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) { bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) {
CheckDims(lod_tensor_vec);
return queue_.Send(std::move(lod_tensor_vec)); return queue_.Send(std::move(lod_tensor_vec));
} }
...@@ -65,21 +63,6 @@ class LoDTensorBlockingQueue { ...@@ -65,21 +63,6 @@ class LoDTensorBlockingQueue {
inline bool IsClosed() const { return queue_.IsClosed(); } inline bool IsClosed() const { return queue_.IsClosed(); }
private: private:
void CheckDims(
const std::vector<framework::LoDTensor>& lod_tensor_vec) const {
PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(),
"Expect input size is %d but found %s", dims_.size(),
lod_tensor_vec.size());
for (size_t i = 0; i < dims_.size(); ++i) {
const auto& in_dims = framework::slice_ddim(
lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size());
const auto& expect_dims =
framework::slice_ddim(dims_[i], 1, dims_[i].size());
PADDLE_ENFORCE(in_dims == expect_dims,
"Dims of the %d-th input tensor do not match", i);
}
}
BlockingQueue<std::vector<framework::LoDTensor>> queue_; BlockingQueue<std::vector<framework::LoDTensor>> queue_;
std::vector<framework::DDim> dims_; std::vector<framework::DDim> dims_;
}; };
......
...@@ -216,7 +216,7 @@ class ReshapeKernel { ...@@ -216,7 +216,7 @@ class ReshapeKernel {
if (shape_tensor) { if (shape_tensor) {
auto *shape_data = shape_tensor->data<int>(); auto *shape_data = shape_tensor->data<int>();
framework::Tensor cpu_shape_tensor; framework::Tensor cpu_shape_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(shape_tensor->place())) {
TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>(); shape_data = cpu_shape_tensor.data<int>();
} }
......
...@@ -55,13 +55,14 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) { ...@@ -55,13 +55,14 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
"TensorRT' tensor input requires at least 2 dimensions"); "TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL, PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions"); "TensorRT' tensor input requires at most 4 dimensions");
switch (shape.size()) { switch (shape.size()) {
case 2: case 2:
return nvinfer1::Dims2(shape[0], shape[1]); return nvinfer1::Dims2(1, shape[1]);
case 3: case 3:
return nvinfer1::Dims3(shape[0], shape[1], shape[2]); return nvinfer1::Dims3(1, shape[1], shape[2]);
case 4: case 4:
return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]);
default: default:
return nvinfer1::Dims(); return nvinfer1::Dims();
} }
......
...@@ -93,13 +93,15 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -93,13 +93,15 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto* fluid_v = context.scope().FindVar(y); auto* fluid_v = context.scope().FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>(); auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
fluid_t->Resize(framework::make_ddim(ddim)); fluid_t->Resize(framework::make_ddim(ddim));
// TODO(Superjomn) find some way to determine which device to output the // TODO(Superjomn) find some way to determine which device to output the
// tensor. // tensor.
// if (platform::is_cpu_place(fluid_t->place())) { // if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size. // TODO(Superjomn) change this float to dtype size.
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
FLAGS_tensorrt_engine_batch_size;
engine->GetOutputInCPU(y, engine->GetOutputInCPU(y,
fluid_t->mutable_data<float>(platform::CPUPlace()), fluid_t->mutable_data<float>(platform::CPUPlace()),
size * sizeof(float)); size * sizeof(float));
......
...@@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) { ...@@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) {
LOG(INFO) << "create block desc"; LOG(INFO) << "create block desc";
framework::BlockDesc block_desc(&program, block_); framework::BlockDesc block_desc(&program, block_);
LOG(INFO) << "create mul op"; LOG(INFO) << "create fc op";
auto* mul = block_desc.AppendOp(); auto* fc0 = block_desc.AppendOp();
mul->SetType("mul"); fc0->SetType("fc");
mul->SetInput("X", std::vector<std::string>({"x"})); // 2 x 4 fc0->SetInput("X", std::vector<std::string>({"x"})); // 4 x 1 x 1
mul->SetInput("Y", std::vector<std::string>({"y"})); // 4 x 6 fc0->SetInput("Y", std::vector<std::string>({"y"})); // 4 x 6
mul->SetOutput("Out", std::vector<std::string>({"z"})); // 2 x 6 fc0->SetOutput("Out", std::vector<std::string>({"z"})); // 6 x 1 x 1
LOG(INFO) << "create fc op"; LOG(INFO) << "create fc op";
auto* fc = block_desc.AppendOp(); auto* fc1 = block_desc.AppendOp();
fc->SetType("mul"); fc1->SetType("fc");
fc->SetInput("X", std::vector<std::string>({"z"})); fc1->SetInput("X", std::vector<std::string>({"z"}));
fc->SetInput("Y", std::vector<std::string>({"y0"})); // 6 x 8 fc1->SetInput("Y", std::vector<std::string>({"y0"})); // 6 x 8
fc->SetOutput("Out", std::vector<std::string>({"z0"})); // 2 x 8 fc1->SetOutput("Out", std::vector<std::string>({"z0"})); // 8 x 1 x 1
// Set inputs' variable shape in BlockDesc // Set inputs' variable shape in BlockDesc
AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4})); // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 1, 1}));
AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6})); AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8})); AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6})); AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
// It is wired, need to copy manually. // It is wired, need to copy manually.
*block_->add_ops() = *mul->Proto(); *block_->add_ops() = *fc0->Proto();
*block_->add_ops() = *fc->Proto(); *block_->add_ops() = *fc1->Proto();
ASSERT_EQ(block_->ops_size(), 2); ASSERT_EQ(block_->ops_size(), 2);
LOG(INFO) << "create tensorrt desc"; LOG(INFO) << "create tensorrt desc";
framework::OpDesc engine_op_desc(nullptr); framework::OpDesc engine_op_desc(nullptr);
engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetType("tensorrt_engine");
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"})); engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); block_->SerializeAsString());
...@@ -207,5 +208,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); } ...@@ -207,5 +208,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
USE_TRT_CONVERTER(mul)
USE_TRT_CONVERTER(fc) USE_TRT_CONVERTER(fc)
...@@ -498,10 +498,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -498,10 +498,7 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<framework::Executor>(m, "Executor") py::class_<framework::Executor>(m, "Executor")
.def(py::init<const platform::Place &>()) .def(py::init<const platform::Place &>())
#ifdef PADDLE_WITH_DISTRIBUTE .def("close", &Executor::Close)
.def("begin_pass", &Executor::BeginPass)
.def("end_pass", &Executor::EndPass)
#endif
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars) { int block_id, bool create_local_scope, bool create_vars) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
......
...@@ -247,6 +247,7 @@ class Executor(object): ...@@ -247,6 +247,7 @@ class Executor(object):
p.set_place(place) p.set_place(place)
self.executor = core.Executor(p) self.executor = core.Executor(p)
self.program_caches = dict() self.program_caches = dict()
self._closed = False
def as_lodtensor(self, data): def as_lodtensor(self, data):
""" """
...@@ -348,11 +349,23 @@ class Executor(object): ...@@ -348,11 +349,23 @@ class Executor(object):
] ]
return outs return outs
def begin_pass(self): def close(self):
self.executor.begin_pass() """
Close this executor.
def end_pass(self): You can no long use this executor after calling this method.
self.executor.end_pass() For the distributed training, this method would free the resource on PServers related to
the current Trainer.
Example:
>>> cpu = core.CPUPlace()
>>> exe = Executor(cpu)
>>> ...
>>> exe.close()
"""
if not self._closed:
self.executor.close()
self._closed = True
def run(self, def run(self,
program=None, program=None,
...@@ -405,6 +418,10 @@ class Executor(object): ...@@ -405,6 +418,10 @@ class Executor(object):
>>> feed={'X': x}, >>> feed={'X': x},
>>> fetch_list=[loss.name]) >>> fetch_list=[loss.name])
""" """
if self._closed:
raise RuntimeError("Attempted to use a closed Executor")
if feed is None: if feed is None:
feed = {} feed = {}
if not isinstance(feed, dict): if not isinstance(feed, dict):
......
...@@ -32,7 +32,6 @@ except Exception, e: ...@@ -32,7 +32,6 @@ except Exception, e:
import unique_name import unique_name
__all__ = [ __all__ = [
'Variable',
'Program', 'Program',
'Operator', 'Operator',
'Parameter', 'Parameter',
...@@ -302,7 +301,7 @@ class Variable(object): ...@@ -302,7 +301,7 @@ class Variable(object):
__repr__ = __str__ __repr__ = __str__
def set_desc(self, input): def _set_desc(self, input):
""" """
Set the variable description. Set the variable description.
...@@ -347,7 +346,7 @@ class Variable(object): ...@@ -347,7 +346,7 @@ class Variable(object):
def type(self): def type(self):
return self.desc.type() return self.desc.type()
def set_error_clip(self, error_clip): def _set_error_clip(self, error_clip):
""" """
Set the error_clip. Set the error_clip.
......
...@@ -796,104 +796,6 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -796,104 +796,6 @@ def get_parameter_value_by_name(name, executor, program=None):
return get_parameter_value(var, executor) return get_parameter_value(var, executor)
def get_test_program(filelist, program=None, startup_program=None):
"""
Transpile current train program to a program to read test dataset
if the program is using reader ops like "open_files_op".
"""
def _copy_reader_var_(block, var, new_name=None):
if new_name == None:
new_name = var.name
new_var = block.create_var(
name=str(new_name), type=core.VarDesc.VarType.READER)
new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes())
new_var.persistable = True
return new_var
def _get_test_reader_name(train_reader_name):
return train_reader_name + "_test"
def _is_reader_op(op):
block = op.block
if "Out" in op.output_names:
reader_out = block.vars[op.output("Out")[0]]
if reader_out.type == core.VarDesc.VarType.READER:
return True
return False
if program == None:
program = default_main_program()
if startup_program == None:
startup_program = default_startup_program()
startup_block = startup_program.global_block()
# 1. find out the orignal reader var name
startup_reader_op_list = []
for op in startup_block.ops:
if _is_reader_op(op):
startup_reader_op_list.append(op)
if len(startup_reader_op_list) == 0:
return program
root_reader_op = startup_reader_op_list[0]
train_test_reader_map = {}
# 2. add operators to startup to read open and read test data files
for op in startup_reader_op_list:
assert (len(op.output("Out")) == 1)
train_reader_name = op.output("Out")[0]
train_reader = startup_block.vars[train_reader_name]
test_reader = _copy_reader_var_(
startup_block,
train_reader,
new_name=_get_test_reader_name(train_reader_name))
train_test_reader_map[train_reader.name] = test_reader
test_op_inputs = {}
for name in op.input_names:
train_arg_names = op.input(name)
test_arg_vars = []
for arg_name in train_arg_names:
arg_var = train_test_reader_map[
arg_name] if name == "UnderlyingReader" else startup_block.vars[
arg_name]
test_arg_vars.append(arg_var)
test_op_inputs[name] = test_arg_vars
test_op = startup_block.append_op(
type=op.type,
inputs=test_op_inputs,
outputs={'Out': [test_reader]},
attrs=op.attrs)
# root reader op's filelist attr for read test files
if op.type == root_reader_op.type:
test_op.set_attr("file_names", filelist)
if op.type == "create_multi_pass_reader":
test_op.set_attr("pass_num", 1)
# 3. rename reader vars in inference program to different name
# to avoid read from train data.
main_block = program.global_block()
for var in main_block.vars.values():
if var.type == core.VarDesc.VarType.READER:
main_block._rename_var(
str(var.name), str(_get_test_reader_name(var.name)))
for op in main_block.ops:
if op.type == root_reader_op.type:
test_op.set_attr("file_names", filelist)
if op.type == "create_multi_pass_reader":
test_op.set_attr("pass_num", 1)
startup_program._sync_with_cpp()
program._sync_with_cpp()
return program
def _load_slice_up_vars(executor, dirname, slice_vars_and_atts): def _load_slice_up_vars(executor, dirname, slice_vars_and_atts):
if slice_vars_and_atts == None or len(slice_vars_and_atts) == 0: if slice_vars_and_atts == None or len(slice_vars_and_atts) == 0:
return return
......
...@@ -23,25 +23,17 @@ from ops import logical_and, logical_not, logical_or ...@@ -23,25 +23,17 @@ from ops import logical_and, logical_not, logical_or
import numpy import numpy
__all__ = [ __all__ = [
'split_lod_tensor',
'merge_lod_tensor',
'While', 'While',
'Switch', 'Switch',
'lod_rank_table',
'max_sequence_len',
'lod_tensor_to_array',
'array_to_lod_tensor',
'increment', 'increment',
'array_write', 'array_write',
'create_array', 'create_array',
'less_than', 'less_than',
'equal', 'equal',
'array_read', 'array_read',
'shrink_memory',
'array_length', 'array_length',
'IfElse', 'IfElse',
'DynamicRNN', 'DynamicRNN',
'ConditionalBlock',
'StaticRNN', 'StaticRNN',
'reorder_lod_tensor_by_rank', 'reorder_lod_tensor_by_rank',
'ParallelDo', 'ParallelDo',
...@@ -1457,7 +1449,7 @@ class IfElse(object): ...@@ -1457,7 +1449,7 @@ class IfElse(object):
if self.status == IfElse.OUT_IF_ELSE_BLOCKS: if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
raise ValueError("input must in true/false blocks") raise ValueError("input must in true/false blocks")
if id(x) not in self.input_table: if id(x) not in self.input_table:
parent_block = self.parent_block() parent_block = self._parent_block()
out_true = parent_block.create_var( out_true = parent_block.create_var(
name=unique_name.generate('ifelse_input' + self.helper.name), name=unique_name.generate('ifelse_input' + self.helper.name),
dtype=x.dtype) dtype=x.dtype)
...@@ -1483,7 +1475,7 @@ class IfElse(object): ...@@ -1483,7 +1475,7 @@ class IfElse(object):
else: else:
return out_false return out_false
def parent_block(self): def _parent_block(self):
current_block = self.helper.main_program.current_block() current_block = self.helper.main_program.current_block()
return self.helper.main_program.block(current_block.parent_idx) return self.helper.main_program.block(current_block.parent_idx)
...@@ -1499,7 +1491,7 @@ class IfElse(object): ...@@ -1499,7 +1491,7 @@ class IfElse(object):
out_table = self.output_table[1 if self.status == out_table = self.output_table[1 if self.status ==
self.IN_IF_ELSE_TRUE_BLOCKS else 0] self.IN_IF_ELSE_TRUE_BLOCKS else 0]
parent_block = self.parent_block() parent_block = self._parent_block()
for each_out in outs: for each_out in outs:
if not isinstance(each_out, Variable): if not isinstance(each_out, Variable):
raise TypeError("Each output should be a variable") raise TypeError("Each output should be a variable")
......
...@@ -62,10 +62,10 @@ def noam_decay(d_model, warmup_steps): ...@@ -62,10 +62,10 @@ def noam_decay(d_model, warmup_steps):
The decayed learning rate. The decayed learning rate.
""" """
global_step = _decay_step_counter(1) global_step = _decay_step_counter(1)
with init_on_cpu():
a = global_step**-0.5 a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
return lr_value return lr_value
...@@ -108,12 +108,10 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -108,12 +108,10 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu(): div_res = global_step / decay_steps
# update learning_rate if staircase:
div_res = global_step / decay_steps div_res = ops.floor(div_res)
if staircase: decayed_lr = learning_rate * (decay_rate**div_res)
div_res = ops.floor(div_res)
decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr return decayed_lr
...@@ -138,11 +136,10 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -138,11 +136,10 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu(): div_res = global_step / decay_steps
div_res = global_step / decay_steps if staircase:
if staircase: div_res = ops.floor(div_res)
div_res = ops.floor(div_res) decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
return decayed_lr return decayed_lr
...@@ -184,12 +181,11 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -184,12 +181,11 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu(): div_res = global_step / decay_steps
div_res = global_step / decay_steps if staircase:
if staircase: div_res = ops.floor(div_res)
div_res = ops.floor(div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res) decayed_lr = learning_rate / (1 + decay_rate * div_res)
return decayed_lr return decayed_lr
...@@ -224,25 +220,22 @@ def polynomial_decay(learning_rate, ...@@ -224,25 +220,22 @@ def polynomial_decay(learning_rate,
""" """
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu(): if cycle:
if cycle: div_res = ops.ceil(global_step / decay_steps)
div_res = ops.ceil(global_step / decay_steps) zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0)
zero_var = tensor.fill_constant( one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0)
shape=[1], dtype='float32', value=0.0)
one_var = tensor.fill_constant(
shape=[1], dtype='float32', value=1.0)
with control_flow.Switch() as switch:
with switch.case(global_step == zero_var):
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else:
decay_steps_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \ with control_flow.Switch() as switch:
((1 - global_step / decay_steps) ** power) + end_learning_rate with switch.case(global_step == zero_var):
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else:
decay_steps_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr return decayed_lr
...@@ -277,28 +270,28 @@ def piecewise_decay(boundaries, values): ...@@ -277,28 +270,28 @@ def piecewise_decay(boundaries, values):
global_step = _decay_step_counter() global_step = _decay_step_counter()
with init_on_cpu(): lr = tensor.create_global_var(
lr = tensor.create_global_var( shape=[1],
shape=[1], value=0.0,
value=0.0, dtype='float32',
dtype='float32', persistable=True,
persistable=True, name="learning_rate")
name="learning_rate")
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
for i in range(len(boundaries)): for i in range(len(boundaries)):
boundary_val = tensor.fill_constant( boundary_val = tensor.fill_constant(
shape=[1], dtype='float32', value=float(boundaries[i]))
value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], shape=[1],
dtype='float32', dtype='float32',
value=float(values[len(values) - 1])) value=float(boundaries[i]),
with switch.default(): force_cpu=True)
tensor.assign(last_value_var, lr) value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[len(values) - 1]))
with switch.default():
tensor.assign(last_value_var, lr)
return lr return lr
...@@ -333,9 +326,9 @@ def append_LARS(params_grads, learning_rate, weight_decay): ...@@ -333,9 +326,9 @@ def append_LARS(params_grads, learning_rate, weight_decay):
grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad))) grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
if type(param_lr) == float and param_lr == 1.0: if type(param_lr) == float and param_lr == 1.0:
decayed_lr = learning_rate * param_norm \ decayed_lr = learning_rate * param_norm \
/ _balanced_weight(param_norm, grad_norm) / _balanced_weight(param_norm, grad_norm)
else: else:
decayed_lr = learning_rate * param_lr * param_norm \ decayed_lr = learning_rate * param_lr * param_norm \
/ _balanced_weight(param_norm, grad_norm) / _balanced_weight(param_norm, grad_norm)
# set back param local learning rate # set back param local learning rate
param.optimize_attr['learning_rate'] = decayed_lr param.optimize_attr['learning_rate'] = decayed_lr
...@@ -35,7 +35,7 @@ if len(sys.argv) == 1: ...@@ -35,7 +35,7 @@ if len(sys.argv) == 1:
word_dict = paddle.dataset.imdb.word_dict() word_dict = paddle.dataset.imdb.word_dict()
else: else:
word_dict = load_vocab(sys.argv[1]) word_dict = load_vocab(sys.argv[1])
word_dict["<unk>"] = len(word_dict) word_dict["<unk>"] = len(word_dict)
print "Dict dim = ", len(word_dict) print "Dict dim = ", len(word_dict)
# input text data # input text data
...@@ -50,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace()) ...@@ -50,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace())
BATCH_SIZE = 128 BATCH_SIZE = 128
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=10000), paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
test_reader = paddle.batch( test_reader = paddle.batch(
......
...@@ -19,7 +19,7 @@ import sys ...@@ -19,7 +19,7 @@ import sys
TRAIN_FILES = ['train.recordio'] TRAIN_FILES = ['train.recordio']
TEST_FILES = ['test.recordio'] TEST_FILES = ['test.recordio']
DICT_DIM = 89528 DICT_DIM = 5147
# embedding dim # embedding dim
emb_dim = 128 emb_dim = 128
...@@ -27,58 +27,46 @@ emb_dim = 128 ...@@ -27,58 +27,46 @@ emb_dim = 128
# hidden dim # hidden dim
hid_dim = 128 hid_dim = 128
# hidden dim2
hid_dim2 = 96
# class num # class num
class_dim = 2 class_dim = 2
# epoch num
epoch_num = 10
def network_cfg(is_train, pass_num=100):
with fluid.unique_name.guard():
train_file_obj = fluid.layers.open_files(
filenames=TRAIN_FILES,
pass_num=pass_num,
shapes=[[-1, 1], [-1, 1]],
lod_levels=[1, 0],
dtypes=['int64', 'int64'])
test_file_obj = fluid.layers.open_files(
filenames=TEST_FILES,
pass_num=1,
shapes=[[-1, 1], [-1, 1]],
lod_levels=[1, 0],
dtypes=['int64', 'int64'])
if is_train: def build_program(is_train):
file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000) file_obj_handle = fluid.layers.io.open_files(
else: filenames=TRAIN_FILES if is_train else TEST_FILES,
file_obj = test_file_obj shapes=[[-1, 1], [-1, 1]],
lod_levels=[1, 0],
dtypes=['int64', 'int64'])
file_obj = fluid.layers.double_buffer( file_obj = fluid.layers.io.double_buffer(file_obj_handle)
file_obj,
name="train_double_buffer" if is_train else 'test_double_buffer') with fluid.unique_name.guard():
data, label = fluid.layers.read_file(file_obj) data, label = fluid.layers.read_file(file_obj)
emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim]) emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim])
# sequence conv with window size = 3
win_size = 3
conv_3 = fluid.nets.sequence_conv_pool( conv_3 = fluid.nets.sequence_conv_pool(
input=emb, input=emb,
num_filters=hid_dim, num_filters=hid_dim,
filter_size=win_size, filter_size=3,
act="tanh", act="tanh",
pool_type="max") pool_type="sqrt")
# fc layer after conv conv_4 = fluid.nets.sequence_conv_pool(
fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) input=emb,
num_filters=hid_dim,
filter_size=4,
act="tanh",
pool_type="sqrt")
# probability of each class prediction = fluid.layers.fc(input=[conv_3, conv_4],
prediction = fluid.layers.fc(input=[fc_1],
size=class_dim, size=class_dim,
act="softmax") act="softmax")
# cross entropy loss # cross entropy loss
cost = fluid.layers.cross_entropy(input=prediction, label=label) cost = fluid.layers.cross_entropy(input=prediction, label=label)
...@@ -88,58 +76,62 @@ def network_cfg(is_train, pass_num=100): ...@@ -88,58 +76,62 @@ def network_cfg(is_train, pass_num=100):
if is_train: if is_train:
# SGD optimizer # SGD optimizer
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
return { return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle}
'loss': avg_cost,
'log': [avg_cost, acc],
'file': train_file_obj if is_train else test_file_obj
}
def main(): def main():
train = fluid.Program() train = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
test = fluid.Program()
with fluid.program_guard(train, startup): with fluid.program_guard(train, startup):
train_args = network_cfg(is_train=True) train_args = build_program(is_train=True)
test = fluid.Program()
with fluid.program_guard(test, fluid.Program()): with fluid.program_guard(test, startup):
test_args = network_cfg(is_train=False) test_args = build_program(is_train=False)
use_cuda = fluid.core.is_compiled_with_cuda()
# startup # startup
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place=place) exe = fluid.Executor(place=place)
exe.run(startup) exe.run(startup)
train_exe = fluid.ParallelExecutor( train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=train_args['loss'].name, main_program=train) use_cuda=use_cuda,
loss_name=train_args['loss'].name,
main_program=train)
test_exe = fluid.ParallelExecutor(
use_cuda=use_cuda, main_program=test, share_vars_from=train_exe)
fetch_var_list = [var.name for var in train_args['log']] fetch_var_list = [var.name for var in train_args['log']]
for i in xrange(sys.maxint): for epoch_id in range(epoch_num):
result = map(numpy.array, # train
train_exe.run(fetch_list=fetch_var_list try:
if i % 1000 == 0 else [])) batch_id = 0
if len(result) != 0: while True:
print 'Train: ', result loss, acc = map(numpy.array,
train_exe.run(fetch_list=fetch_var_list))
if i % 1000 == 0: print 'Train epoch', epoch_id, 'batch', batch_id, 'loss:', loss, 'acc:', acc
test_exe = fluid.ParallelExecutor( batch_id += 1
use_cuda=True, main_program=test, share_vars_from=train_exe) except fluid.core.EOFException:
loss = [] print 'End of epoch', epoch_id
acc = [] train_args['file'].reset()
try:
while True: # test
loss_np, acc_np = map( loss = []
numpy.array, test_exe.run(fetch_list=fetch_var_list)) acc = []
loss.append(loss_np[0]) try:
acc.append(acc_np[0]) while True:
except: loss_np, acc_np = map(numpy.array,
test_args['file'].reset() test_exe.run(fetch_list=fetch_var_list))
print 'TEST: ', numpy.mean(loss), numpy.mean(acc) loss.append(loss_np[0])
acc.append(acc_np[0])
except:
test_args['file'].reset()
print 'Test loss:', numpy.mean(loss), 'acc:', numpy.mean(acc)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -36,7 +36,7 @@ with fluid.program_guard(main_program=prog): ...@@ -36,7 +36,7 @@ with fluid.program_guard(main_program=prog):
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
prog_clip = prog.clone() prog_clip = prog.clone()
prog_clip.block(0).var(hidden1.name).set_error_clip( prog_clip.block(0).var(hidden1.name)._set_error_clip(
fluid.clip.ErrorClipByValue( fluid.clip.ErrorClipByValue(
max=CLIP_MAX, min=CLIP_MIN)) max=CLIP_MAX, min=CLIP_MIN))
......
...@@ -19,6 +19,10 @@ from paddle.fluid.executor import Executor ...@@ -19,6 +19,10 @@ from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import MomentumOptimizer from paddle.fluid.optimizer import MomentumOptimizer
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.layers.control_flow import split_lod_tensor
from paddle.fluid.layers.control_flow import merge_lod_tensor
from paddle.fluid.layers.control_flow import ConditionalBlock
import unittest import unittest
import numpy as np import numpy as np
...@@ -34,11 +38,10 @@ class TestMNISTIfElseOp(unittest.TestCase): ...@@ -34,11 +38,10 @@ class TestMNISTIfElseOp(unittest.TestCase):
limit = layers.fill_constant(shape=[1], dtype='int64', value=5) limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
cond = layers.less_than(x=label, y=limit) cond = layers.less_than(x=label, y=limit)
true_image, false_image = layers.split_lod_tensor( true_image, false_image = split_lod_tensor(input=image, mask=cond)
input=image, mask=cond)
true_out = layers.create_tensor(dtype='float32') true_out = layers.create_tensor(dtype='float32')
true_cond = layers.ConditionalBlock([cond]) true_cond = ConditionalBlock([cond])
with true_cond.block(): with true_cond.block():
hidden = layers.fc(input=true_image, size=100, act='tanh') hidden = layers.fc(input=true_image, size=100, act='tanh')
...@@ -46,14 +49,14 @@ class TestMNISTIfElseOp(unittest.TestCase): ...@@ -46,14 +49,14 @@ class TestMNISTIfElseOp(unittest.TestCase):
layers.assign(input=prob, output=true_out) layers.assign(input=prob, output=true_out)
false_out = layers.create_tensor(dtype='float32') false_out = layers.create_tensor(dtype='float32')
false_cond = layers.ConditionalBlock([cond]) false_cond = ConditionalBlock([cond])
with false_cond.block(): with false_cond.block():
hidden = layers.fc(input=false_image, size=200, act='tanh') hidden = layers.fc(input=false_image, size=200, act='tanh')
prob = layers.fc(input=hidden, size=10, act='softmax') prob = layers.fc(input=hidden, size=10, act='softmax')
layers.assign(input=prob, output=false_out) layers.assign(input=prob, output=false_out)
prob = layers.merge_lod_tensor( prob = merge_lod_tensor(
in_true=true_out, in_false=false_out, mask=cond, x=image) in_true=true_out, in_false=false_out, mask=cond, x=image)
loss = layers.cross_entropy(input=prob, label=label) loss = layers.cross_entropy(input=prob, label=label)
avg_loss = layers.mean(loss) avg_loss = layers.mean(loss)
......
...@@ -251,7 +251,7 @@ class OpTest(unittest.TestCase): ...@@ -251,7 +251,7 @@ class OpTest(unittest.TestCase):
for out_name, out_dup in Operator.get_op_outputs(self.op_type): for out_name, out_dup in Operator.get_op_outputs(self.op_type):
fetch_list.append(str(out_name)) fetch_list.append(str(out_name))
# fetch_list = map(block.var, fetch_list) # fetch_list = map(block.var, fetch_list)
if not isinstance(fetch_list[0], Variable): if not isinstance(fetch_list[0], fluid.framework.Variable):
fetch_list = map(block.var, fetch_list) fetch_list = map(block.var, fetch_list)
outs = executor.run(program, outs = executor.run(program,
feed=feed_map, feed=feed_map,
......
...@@ -18,14 +18,15 @@ import paddle.fluid.core as core ...@@ -18,14 +18,15 @@ import paddle.fluid.core as core
from paddle.fluid.framework import default_startup_program, default_main_program from paddle.fluid.framework import default_startup_program, default_main_program
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
from paddle.fluid.layers.control_flow import ConditionalBlock
import numpy import numpy
class ConditionalBlock(unittest.TestCase): class ConditionalBlockTest(unittest.TestCase):
def test_forward(self): def test_forward(self):
data = layers.data(name='X', shape=[1], dtype='float32') data = layers.data(name='X', shape=[1], dtype='float32')
data.stop_gradient = False data.stop_gradient = False
cond = layers.ConditionalBlock(inputs=[data]) cond = ConditionalBlock(inputs=[data])
out = layers.create_tensor(dtype='float32') out = layers.create_tensor(dtype='float32')
with cond.block(): with cond.block():
hidden = layers.fc(input=data, size=10) hidden = layers.fc(input=data, size=10)
......
...@@ -16,7 +16,7 @@ import unittest ...@@ -16,7 +16,7 @@ import unittest
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
class ConditionalBlock(unittest.TestCase): class ConstantTest(unittest.TestCase):
def test_const_value(self): def test_const_value(self):
self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD") self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@") self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
......
...@@ -17,6 +17,12 @@ import paddle ...@@ -17,6 +17,12 @@ import paddle
import unittest import unittest
import numpy import numpy
from paddle.fluid.layers.control_flow import lod_rank_table
from paddle.fluid.layers.control_flow import max_sequence_len
from paddle.fluid.layers.control_flow import lod_tensor_to_array
from paddle.fluid.layers.control_flow import array_to_lod_tensor
from paddle.fluid.layers.control_flow import shrink_memory
class TestDynRNN(unittest.TestCase): class TestDynRNN(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -38,12 +44,11 @@ class TestDynRNN(unittest.TestCase): ...@@ -38,12 +44,11 @@ class TestDynRNN(unittest.TestCase):
label = fluid.layers.data(name='label', shape=[1], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='float32')
rank_table = fluid.layers.lod_rank_table(x=sent_emb) rank_table = lod_rank_table(x=sent_emb)
sent_emb_array = fluid.layers.lod_tensor_to_array( sent_emb_array = lod_tensor_to_array(x=sent_emb, table=rank_table)
x=sent_emb, table=rank_table)
seq_len = fluid.layers.max_sequence_len(rank_table=rank_table) seq_len = max_sequence_len(rank_table=rank_table)
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
i.stop_gradient = False i.stop_gradient = False
...@@ -66,7 +71,7 @@ class TestDynRNN(unittest.TestCase): ...@@ -66,7 +71,7 @@ class TestDynRNN(unittest.TestCase):
mem = fluid.layers.array_read(array=mem_array, i=i) mem = fluid.layers.array_read(array=mem_array, i=i)
ipt = fluid.layers.array_read(array=sent_emb_array, i=i) ipt = fluid.layers.array_read(array=sent_emb_array, i=i)
mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table) mem = shrink_memory(x=mem, i=i, table=rank_table)
hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh') hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
...@@ -75,8 +80,7 @@ class TestDynRNN(unittest.TestCase): ...@@ -75,8 +80,7 @@ class TestDynRNN(unittest.TestCase):
fluid.layers.array_write(x=hidden, i=i, array=mem_array) fluid.layers.array_write(x=hidden, i=i, array=mem_array)
fluid.layers.less_than(x=i, y=seq_len, cond=cond) fluid.layers.less_than(x=i, y=seq_len, cond=cond)
all_timesteps = fluid.layers.array_to_lod_tensor( all_timesteps = array_to_lod_tensor(x=out, table=rank_table)
x=out, table=rank_table)
last = fluid.layers.sequence_last_step(input=all_timesteps) last = fluid.layers.sequence_last_step(input=all_timesteps)
logits = fluid.layers.fc(input=last, size=1, act=None) logits = fluid.layers.fc(input=last, size=1, act=None)
loss = fluid.layers.sigmoid_cross_entropy_with_logits( loss = fluid.layers.sigmoid_cross_entropy_with_logits(
......
...@@ -91,20 +91,21 @@ class TestLearningRateDecay(unittest.TestCase): ...@@ -91,20 +91,21 @@ class TestLearningRateDecay(unittest.TestCase):
def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn, def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
kwargs): kwargs):
main_prog = fluid.Program()
startup_prog = fluid.Program()
decayed_lr = fluid_decay_fn(**kwargs) with fluid.program_guard(main_prog, startup_prog):
decayed_lr = fluid_decay_fn(**kwargs)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(startup_prog)
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(main_prog)
for step in range(10): for step in range(10):
lr_val, = exe.run(fluid.default_main_program(), lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
feed={},
fetch_list=[decayed_lr])
python_decayed_lr = python_decay_fn( python_decayed_lr = python_decay_fn(
global_step=float(step), **kwargs) global_step=float(step), **kwargs)
self.assertAlmostEqual( self.assertAlmostEqual(
......
...@@ -12,7 +12,8 @@ ...@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle.fluid.layers import lod_rank_table, data from paddle.fluid.layers import data
from paddle.fluid.layers.control_flow import lod_rank_table
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy import numpy
......
...@@ -20,6 +20,11 @@ from paddle.fluid.framework import Program, program_guard ...@@ -20,6 +20,11 @@ from paddle.fluid.framework import Program, program_guard
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
from paddle.fluid.layers.control_flow import lod_rank_table
from paddle.fluid.layers.control_flow import max_sequence_len
from paddle.fluid.layers.control_flow import lod_tensor_to_array
from paddle.fluid.layers.control_flow import array_to_lod_tensor
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
def place(self): def place(self):
...@@ -137,13 +142,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ...@@ -137,13 +142,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
with program_guard(program): with program_guard(program):
x = layers.data(name='x', shape=[10]) x = layers.data(name='x', shape=[10])
x.persistable = True x.persistable = True
table = layers.lod_rank_table(x, level=level) table = lod_rank_table(x, level=level)
max_len = layers.max_sequence_len(table) max_len = max_sequence_len(table)
max_len.persistable = True max_len.persistable = True
array = layers.lod_tensor_to_array(x, table) array = lod_tensor_to_array(x, table)
array.persistable = True array.persistable = True
result = layers.array_to_lod_tensor(array, table) result = array_to_lod_tensor(array, table)
result.persistable = True result.persistable = True
exe = Executor(place) exe = Executor(place)
scope = core.Scope() scope = core.Scope()
...@@ -181,9 +186,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): ...@@ -181,9 +186,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
with program_guard(program): with program_guard(program):
x = layers.data( x = layers.data(
name='x', shape=[1], dtype='float32', stop_gradient=False) name='x', shape=[1], dtype='float32', stop_gradient=False)
table = layers.lod_rank_table(x, level=0) table = lod_rank_table(x, level=0)
array = layers.lod_tensor_to_array(x, table) array = lod_tensor_to_array(x, table)
result = layers.array_to_lod_tensor(array, table) result = array_to_lod_tensor(array, table)
mean = layers.mean(result) mean = layers.mean(result)
......
...@@ -107,44 +107,24 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -107,44 +107,24 @@ class TestMNIST(TestParallelExecutorBase):
label = np.ones(shape=[32, 1], dtype='int64') label = np.ones(shape=[32, 1], dtype='int64')
return img, label return img, label
# simple_fc def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True):
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
return return
self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
self.check_network_convergence( self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) model, use_cuda=use_cuda, use_reduce=True)
img, label = self._init_data()
self.check_network_convergence( self.check_network_convergence(
simple_fc_net, model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True)
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=use_reduce)
def check_simple_fc_convergence_with_Reduce(self, use_cuda): img, label = self._init_data(random_data)
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, use_reduce=True)
self.check_network_convergence(
simple_fc_net,
use_cuda=use_cuda,
allow_op_delay=True,
use_reduce=True)
img, label = self._init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
simple_fc_net, model,
feed_dict={"image": img, feed_dict={"image": img,
"label": label}, "label": label},
use_cuda=use_cuda, use_cuda=use_cuda,
use_reduce=False) use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence( reduce_first_loss, reduce_last_loss = self.check_network_convergence(
simple_fc_net, model,
feed_dict={"image": img, feed_dict={"image": img,
"label": label}, "label": label},
use_cuda=use_cuda, use_cuda=use_cuda,
...@@ -153,7 +133,24 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -153,7 +133,24 @@ class TestMNIST(TestParallelExecutorBase):
for loss in zip(all_reduce_first_loss, reduce_first_loss): for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(all_reduce_last_loss, reduce_last_loss): for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
# simple_fc
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
img, label = self._init_data()
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=use_reduce)
def test_simple_fc(self): def test_simple_fc(self):
# use_cuda # use_cuda
...@@ -162,8 +159,8 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -162,8 +159,8 @@ class TestMNIST(TestParallelExecutorBase):
def test_simple_fc_with_new_strategy(self): def test_simple_fc_with_new_strategy(self):
# use_cuda, use_reduce # use_cuda, use_reduce
self.check_simple_fc_convergence_with_Reduce(True) self._compare_reduce_and_allreduce(simple_fc_net, True)
self.check_simple_fc_convergence_with_Reduce(False) self._compare_reduce_and_allreduce(simple_fc_net, False)
def check_simple_fc_parallel_accuracy(self, use_cuda): def check_simple_fc_parallel_accuracy(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
...@@ -209,39 +206,13 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -209,39 +206,13 @@ class TestMNIST(TestParallelExecutorBase):
"label": label}, "label": label},
use_cuda=use_cuda) use_cuda=use_cuda)
def check_batchnorm_fc_convergence_use_reduce(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(
fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True)
img, label = self._init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=True)
for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
def test_batchnorm_fc(self): def test_batchnorm_fc(self):
self.check_batchnorm_fc_convergence(True) self.check_batchnorm_fc_convergence(True)
self.check_batchnorm_fc_convergence(False) self.check_batchnorm_fc_convergence(False)
def test_batchnorm_fc_with_new_strategy(self): def test_batchnorm_fc_with_new_strategy(self):
self.check_batchnorm_fc_convergence_use_reduce(True) self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
self.check_batchnorm_fc_convergence_use_reduce(False) self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -120,7 +120,7 @@ class BaseParallelForTest(unittest.TestCase): ...@@ -120,7 +120,7 @@ class BaseParallelForTest(unittest.TestCase):
pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
data = next(generator) data = next(generator)
if isinstance(data, fluid.Variable): if isinstance(data, fluid.framework.Variable):
data = [data] data = [data]
with pd.do(): with pd.do():
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.layers.control_flow import lod_rank_table
import numpy import numpy
...@@ -34,7 +35,7 @@ class TestReorderLoDTensor(unittest.TestCase): ...@@ -34,7 +35,7 @@ class TestReorderLoDTensor(unittest.TestCase):
dat.stop_gradient = False dat.stop_gradient = False
rank_dat = fluid.layers.data( rank_dat = fluid.layers.data(
name=cls.data_desc[1][0], shape=cls.data_desc[1][1]) name=cls.data_desc[1][0], shape=cls.data_desc[1][1])
table = fluid.layers.lod_rank_table(rank_dat) table = lod_rank_table(rank_dat)
new_dat = fluid.layers.reorder_lod_tensor_by_rank( new_dat = fluid.layers.reorder_lod_tensor_by_rank(
x=dat, rank_table=table) x=dat, rank_table=table)
loss = fluid.layers.reduce_sum(new_dat) loss = fluid.layers.reduce_sum(new_dat)
......
...@@ -21,6 +21,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program ...@@ -21,6 +21,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program
from paddle.fluid.framework import Program from paddle.fluid.framework import Program
import numpy as np import numpy as np
from paddle.fluid.layers.control_flow import shrink_memory
from paddle.fluid.layers.control_flow import lod_rank_table
class TestShrinkRNNMemoryBase(unittest.TestCase): class TestShrinkRNNMemoryBase(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -30,15 +33,15 @@ class TestShrinkRNNMemoryBase(unittest.TestCase): ...@@ -30,15 +33,15 @@ class TestShrinkRNNMemoryBase(unittest.TestCase):
x.stop_gradient = False x.stop_gradient = False
rank_table_tensor = layers.data( rank_table_tensor = layers.data(
'rank_table_tensor', shape=[1], dtype='float32', lod_level=1) 'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
table = layers.lod_rank_table(x=rank_table_tensor) table = lod_rank_table(x=rank_table_tensor)
i = layers.zeros(dtype='int64', shape=[1]) i = layers.zeros(dtype='int64', shape=[1])
self.mem1 = layers.shrink_memory(x=x, i=i, table=table) self.mem1 = shrink_memory(x=x, i=i, table=table)
i = layers.increment(x=i) i = layers.increment(x=i)
i.stop_gradient = True i.stop_gradient = True
self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table) self.mem2 = shrink_memory(x=self.mem1, i=i, table=table)
i = layers.increment(x=i) i = layers.increment(x=i)
i.stop_gradient = True i.stop_gradient = True
self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table) self.mem3 = shrink_memory(x=self.mem2, i=i, table=table)
mem3_mean = layers.mean(self.mem3) mem3_mean = layers.mean(self.mem3)
append_backward(loss=mem3_mean) append_backward(loss=mem3_mean)
self.x_grad = self.main_program.global_block().var('x@GRAD') self.x_grad = self.main_program.global_block().var('x@GRAD')
......
...@@ -19,6 +19,8 @@ import paddle.fluid.layers as layers ...@@ -19,6 +19,8 @@ import paddle.fluid.layers as layers
from paddle.fluid.framework import Program, program_guard from paddle.fluid.framework import Program, program_guard
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
from paddle.fluid.layers.control_flow import split_lod_tensor
from paddle.fluid.layers.control_flow import merge_lod_tensor
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
...@@ -96,12 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ...@@ -96,12 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
y = layers.data(name='y', shape=[1]) y = layers.data(name='y', shape=[1])
y.persistable = True y.persistable = True
out_true, out_false = layers.split_lod_tensor( out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
input=x, mask=y, level=level)
out_true.persistable = True out_true.persistable = True
out_false.persistable = True out_false.persistable = True
out = layers.merge_lod_tensor( out = merge_lod_tensor(
in_true=out_true, in_false=out_false, mask=y, x=x, level=level) in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
out.persistable = True out.persistable = True
...@@ -142,9 +143,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): ...@@ -142,9 +143,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
level = 0 level = 0
out_true, out_false = layers.split_lod_tensor( out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
input=x, mask=y, level=level) out = merge_lod_tensor(
out = layers.merge_lod_tensor(
in_true=out_true, in_false=out_false, mask=y, x=x, level=level) in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
mean = layers.mean(out) mean = layers.mean(out)
......
...@@ -38,7 +38,7 @@ from ps_dispatcher import RoundRobin, HashName, PSDispatcher ...@@ -38,7 +38,7 @@ from ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework from .. import core, framework
from ..framework import Program, default_main_program, \ from ..framework import Program, default_main_program, \
default_startup_program, Block, \ default_startup_program, Block, \
Variable, Parameter, grad_var_name Parameter, grad_var_name
from details import * from details import *
LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_TYPE = "lookup_table"
...@@ -918,7 +918,8 @@ class DistributeTranspiler(object): ...@@ -918,7 +918,8 @@ class DistributeTranspiler(object):
# create table optimize block in pserver program # create table optimize block in pserver program
table_opt_op = [ table_opt_op = [
op for op in self.optimize_ops op for op in self.optimize_ops
if op.input("Param")[0] == self.table_name if 'Param' in op.input_names and op.input("Param")[0] ==
self.table_name
][0] ][0]
table_opt_block = pserver_program.create_block(pre_block_idx) table_opt_block = pserver_program.create_block(pre_block_idx)
# only support sgd now # only support sgd now
...@@ -1075,7 +1076,6 @@ class DistributeTranspiler(object): ...@@ -1075,7 +1076,6 @@ class DistributeTranspiler(object):
] ]
def _clone_var(self, block, var, persistable=True): def _clone_var(self, block, var, persistable=True):
assert isinstance(var, Variable)
return block.create_var( return block.create_var(
name=var.name, name=var.name,
shape=var.shape, shape=var.shape,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from collections import defaultdict from collections import defaultdict
from .. import core from .. import core
from ..framework import Program, default_main_program, Parameter, Variable from ..framework import Program, default_main_program, Parameter
from ..backward import _rename_arg_ from ..backward import _rename_arg_
dtype_to_size = { dtype_to_size = {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册