diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 69d665b80fde22c10d9d57687b0e45dae7291969..6efb03dabe89b28f3ff1a55c4a940dfe74e8001d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1,8 +1,3 @@ -paddle.fluid.Variable.__init__ ArgSpec(args=['self', 'block', 'type', 'name', 'shape', 'dtype', 'lod_level', 'capacity', 'persistable', 'error_clip', 'stop_gradient', 'is_data'], varargs=None, keywords='kwargs', defaults=(VarType.LOD_TENSOR, None, None, None, None, None, None, None, False, False)) -paddle.fluid.Variable.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Variable.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Variable.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Variable.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) @@ -33,8 +28,6 @@ paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=Non paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None) paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Parameter.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Parameter.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None) paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) @@ -42,8 +35,7 @@ paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', def paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Executor.begin_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Executor.end_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) @@ -207,31 +199,23 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,)) -paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.lod_rank_table ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=(0,)) -paddle.fluid.layers.max_sequence_len ArgSpec(args=['rank_table'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.lod_tensor_to_array ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.array_to_lod_tensor ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)) paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)) paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.shrink_memory ArgSpec(args=['x', 'i', 'table'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) -paddle.fluid.layers.IfElse.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) @@ -240,9 +224,6 @@ paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.ConditionalBlock.__init__ ArgSpec(args=['self', 'inputs', 'is_scalar_condition', 'name'], varargs=None, keywords=None, defaults=(False, None)) -paddle.fluid.layers.ConditionalBlock.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.ConditionalBlock.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 84f67fafa19ac545ebb7a1019059e3c74c363c56..c2800c972a5501859672fbfd6921499e84d09cb0 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -45,19 +45,13 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { Executor::Executor(const platform::Place& place) : place_(place) {} +void Executor::Close() { #ifdef PADDLE_WITH_DISTRIBUTE -void Executor::BeginPass() { ::paddle::operators::distributed::RPCClient::GetInstance< ::paddle::operators::distributed::GRPCClient>() - ->SendBeginPass(); -} - -void Executor::EndPass() { - ::paddle::operators::distributed::RPCClient::GetInstance< - ::paddle::operators::distributed::GRPCClient>() - ->SendEndPass(); -} + ->SendComplete(); #endif +} void InitializeVariable(Variable* var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 563a4b2bb65dad481a755f67c7f23939816ce8e8..214ca3dc492c31d4c683790a6ae051be467401c9 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -44,17 +44,11 @@ class Executor { explicit Executor(const platform::Place& place); -#ifdef PADDLE_WITH_DISTRIBUTE /* - * Sending signal to pserver to mark current pass started. + * Close this Executor. + * Calling this method will send complete messages to all pserver instances. */ - void BeginPass(); - - /* - * Sending signal to pserver to mark current pass finished. - */ - void EndPass(); -#endif + void Close(); /* @Brief * Runtime evaluation of the given ProgramDesc under certain Scope diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3ae255e13fc4f3ca28a6af62a5d5944d84303fc7..58fd7c6f8b05a846bd4a82068f09f5d9ef5a6516 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -137,6 +137,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, executor_->RunPreparedContext( ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(), &feed_targets, &fetch_targets, + false, /* don't create local scope each time*/ false /* don't create variable eatch time */); VLOG(4) << "Finish prepared context"; if (!GetFetch(fetchs, output_data)) { diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index bb603efaf30bb72d74b5583abc45d01a16c076a3..409efac6799b6fb8d27a1343a55e7a508760868f 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -32,11 +32,11 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, for (int h = 0; h < shape.h(); ++h) { for (int w = 0; w < shape.w(); ++w) { odata[h * ostrides.h() + w * ostrides.w()] = - idata[h * ostrides.h() + w * ostrides.w()]; + idata[h * istrides.h() + w * istrides.w()]; } } } - +// indata c * k // Reorder the data layout from CK to KC. void ReorderCKtoKC(TensorRTEngine::Weight& iweights, TensorRTEngine::Weight* oweights) { @@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter { framework::LoDTensor tmp; tmp.Resize(Y_t->dims()); - memcpy(tmp.mutable_data(platform::CPUPlace()), Y_t->data(), - Y_t->dims()[0] * Y_t->dims()[1]); - + memcpy(tmp.mutable_data(platform::CPUPlace()), weight_data, + Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), Y_t->memory_size() / sizeof(float)}; @@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter { // The data layout of TRT FC layer's weight is different from fluid's FC, // need to reorder the elements. - ReorderCKtoKC(tmp_weight, &weight); + ReorderCKtoKC(weight, &tmp_weight); // Currently, the framework can only handle one fluid op -> one TRT layer, // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just @@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter { auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *const_cast(X), - n_output, weight.get(), bias.get()); + n_output, tmp_weight.get(), bias.get()); auto output_name = op_desc.Output("Out").front(); engine_->SetITensor(output_name, layer->getOutput(0)); @@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter { } // namespace paddle REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); -USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 0a02a7bebf9efbd0555707e6cfa701ef1e7d9659..7dabfd9f6a9a8cfbdd1d9a66541180d3499b7bdc 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(10); + validator.Execute(1); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index a30253072ac581ceca85ca10151a176f87a7cb39..081f4d605975f1408d4d8a8ed3108c04d837a4de 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -23,11 +23,11 @@ namespace tensorrt { TEST(fc_op, test) { std::unordered_set parameters({"mul-Y"}); framework::Scope scope; - TRTConvertValidation validator(20, parameters, scope, 1000); - - validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1)); - validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2)); - validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2)); + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1)); + validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2)); // Prepare Op description framework::OpDesc desc; @@ -38,9 +38,10 @@ TEST(fc_op, test) { validator.SetOp(*desc.Proto()); - validator.Execute(10); + validator.Execute(1); } } // namespace tensorrt } // namespace inference } // namespace paddle +USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 1ce1130e5d660d717a1262a1fbdb4b620462c0b3..674f37f2fdddf013a8f6f4671debbc19c3322423 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -39,7 +39,7 @@ TEST(MulOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(10); + validator.Execute(1); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 3b1f531adc5d756259df1c350f7f44bf71ee1f93..f14885b238134cdf38a278cd8a0734947bcacfe0 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -39,7 +39,7 @@ namespace tensorrt { float random(float low, float high) { static std::random_device rd; static std::mt19937 mt(rd()); - std::uniform_real_distribution dist(1.0, 10.0); + std::uniform_real_distribution dist(low, high); return dist(mt); } @@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, size_t num_elements = analysis::AccuDims(dims, dims.size()); PADDLE_ENFORCE_GT(num_elements, 0); auto* data = tensor->mutable_data(place); + for (size_t i = 0; i < num_elements; i++) { *(data + i) = random(0., 1.); } @@ -68,7 +69,7 @@ class TRTConvertValidation { int workspace_size = 1 << 10) : parameters_(parameters), scope_(scope) { // create engine. - engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); + engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_)); engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); @@ -138,12 +139,11 @@ class TRTConvertValidation { cudaStreamSynchronize(*engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); - const size_t output_space_size = 200; + const size_t output_space_size = 2000; for (const auto& output : op_desc_->OutputArgumentNames()) { std::vector fluid_out; std::vector trt_out(output_space_size); - engine_->GetOutputInCPU(output, &trt_out[0], - output_space_size * sizeof(float)); + engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); cudaStreamSynchronize(*engine_->stream()); auto* var = scope_.FindVar(output); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index fefec0df6d03669a294ce9643b666d7416593708..b821c3d0bf425c46fae634fbf53f7ee63100ca5c 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -1,7 +1,7 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 @@ -26,6 +26,8 @@ namespace paddle { namespace inference { namespace tensorrt { +int TensorRTEngine::runtime_batch_ = 1; + void TensorRTEngine::Build(const DescType &paddle_model) { PADDLE_ENFORCE(false, "not implemented"); } @@ -42,6 +44,7 @@ void TensorRTEngine::Execute(int batch_size) { PADDLE_ENFORCE_NOT_NULL(stream_); infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); cudaStreamSynchronize(*stream_); + SetRuntimeBatch(batch_size); } TensorRTEngine::~TensorRTEngine() { @@ -80,17 +83,17 @@ void TensorRTEngine::FreezeNetwork() { auto dims = infer_engine_->getBindingDimensions(slot_offset); item.second = kDataTypeSize[static_cast( infer_engine_->getBindingDataType(slot_offset))] * - analysis::AccuDims(dims.d, dims.nbDims); + analysis::AccuDims(dims.d, dims.nbDims) * max_batch_; PADDLE_ENFORCE_GT(item.second, 0); } auto &buf = buffer(item.first); buf.max_size = item.second * max_batch_; CHECK(buf.buffer == nullptr); // buffer should be allocated only once. - PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size)); - PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G - // buf.size will changed in the runtime. + + PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_)); buf.size = 0; + PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G buf.device = DeviceType::GPU; } } @@ -105,7 +108,7 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, auto *input = infer_network_->addInput(name.c_str(), dtype, dims); PADDLE_ENFORCE(input, "infer network add input %s failed", name); buffer_sizes_[name] = kDataTypeSize[static_cast(dtype)] * - analysis::AccuDims(dims.d, dims.nbDims); + analysis::AccuDims(dims.d, dims.nbDims) * max_batch_; PADDLE_ENFORCE(input->isNetworkInput()); TensorRTEngine::SetITensor(name, input); return input; @@ -149,35 +152,42 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) { void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, size_t max_size) { // determine data size + auto *output = TensorRTEngine::GetITensor(name); + nvinfer1::Dims dims = output->getDimensions(); + auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); + size_t dst_size = dim_size * runtime_batch_ * + kDataTypeSize[static_cast(output->getType())]; + auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_GE(max_size, it->second); + PADDLE_ENFORCE_LE(dst_size, it->second); + PADDLE_ENFORCE_GE(max_size, dst_size); auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second, + PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, cudaMemcpyDeviceToDevice, *stream_), 0); } void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, size_t max_size) { - VLOG(4) << "get output in cpu"; - auto &buf = buffer(name); - - // Update needed buffer size. - auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); - auto dims = infer_engine_->getBindingDimensions(slot_offset); - buf.size = kDataTypeSize[static_cast( - infer_engine_->getBindingDataType(slot_offset))] * - analysis::AccuDims(dims.d, dims.nbDims); - PADDLE_ENFORCE_LE(buf.size, buf.max_size); // determine data size + + auto *output = TensorRTEngine::GetITensor(name); + nvinfer1::Dims dims = output->getDimensions(); + auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); + size_t dst_size = dim_size * runtime_batch_ * + kDataTypeSize[static_cast(output->getType())]; + auto it = buffer_sizes_.find(name); + PADDLE_ENFORCE(it != buffer_sizes_.end()); + PADDLE_ENFORCE_GT(it->second, 0); + PADDLE_ENFORCE_LE(dst_size, it->second); + PADDLE_ENFORCE_GE(max_size, dst_size); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - // DEBUG - memset(dst, 0, buf.size); - PADDLE_ENFORCE_EQ( - 0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost)); + PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, + cudaMemcpyDeviceToHost, *stream_)); } Buffer &TensorRTEngine::buffer(const std::string &name) { @@ -225,6 +235,12 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { return itensor_map_[name]; } +void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { + runtime_batch_ = batch_size; +} + +int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 7064d333f6db754f88c0ac6956a9527a48bf866c..694468c419c20089de1cdecff1a903ad0cc6e99f 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -117,10 +117,14 @@ class TensorRTEngine : public EngineBase { nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } + void SetRuntimeBatch(size_t batch_size); + int GetRuntimeBatch(); private: // the max batch size int max_batch_; + // the runtime batch size + static int runtime_batch_; // the max memory size the engine uses int max_workspace_; diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index fca3488008ed83418b5e28b8af42d8019aaaa2a4..f8732e51b66bdc78aa35d06ba9651f1942a74b01 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(1, 1 << 10, &stream_); + engine_ = new TensorRTEngine(10, 1 << 10, &stream_); engine_->InitNetwork(); } @@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) { LOG(INFO) << "to get output"; float y_cpu; - engine_->GetOutputInCPU("y", &y_cpu, sizeof(float)); + engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float)); LOG(INFO) << "to checkout output"; ASSERT_EQ(y_cpu, x_v * 2 + 3); @@ -103,15 +103,49 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { LOG(INFO) << "to get output"; float y_cpu[2] = {-1., -1.}; + auto dims = engine_->GetITensor("y")->getDimensions(); ASSERT_EQ(dims.nbDims, 3); ASSERT_EQ(dims.d[0], 2); ASSERT_EQ(dims.d[1], 1); - engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2); + engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); } +TEST_F(TensorRTEngineTest, test_conv2d_temp) { + // Weight in CPU memory. + float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + float raw_bias[1] = {0}; + + TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9); + TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1); + auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 3, 3}); + auto* conv_layer = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, + weight.get(), bias.get()); + PADDLE_ENFORCE(conv_layer != nullptr); + conv_layer->setStride(nvinfer1::DimsHW{1, 1}); + conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); + + engine_->DeclareOutput(conv_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), + 18 * sizeof(float)); + engine_->Execute(2); + + LOG(INFO) << "to get output"; + float* y_cpu = new float[18]; + engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float)); + ASSERT_EQ(y_cpu[0], 4.0); + ASSERT_EQ(y_cpu[1], 6.0); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 44c36b1683b037832a218df02184e7cd2ba143e9..695790a37dce889e838462b401ca4e89f09271d5 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -210,13 +210,14 @@ void TestInference(const std::string& dirname, // Ignore the profiling results of the first run std::unique_ptr ctx; + bool CreateLocalScope = CreateVars; if (PrepareContext) { ctx = executor.Prepare(*inference_program, 0); executor.RunPreparedContext(ctx.get(), scope, &feed_targets, - &fetch_targets, true, CreateVars); + &fetch_targets, CreateLocalScope, CreateVars); } else { executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, - true, CreateVars); + CreateLocalScope, CreateVars); } // Enable the profiler @@ -232,10 +233,11 @@ void TestInference(const std::string& dirname, // Note: if you change the inference_program, you need to call // executor.Prepare() again to get a new ExecutorPrepareContext. executor.RunPreparedContext(ctx.get(), scope, &feed_targets, - &fetch_targets, CreateVars); + &fetch_targets, CreateLocalScope, + CreateVars); } else { executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, - CreateVars); + CreateLocalScope, CreateVars); } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 6555b8101a90bba8351d2c82313ab12e572a01ee..1612927055dd4ec5ee2220bc2b285e8d9b640ea8 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -18,7 +18,7 @@ if(WITH_GRPC) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) - cc_test(grpc_server_test SRCS rpc_server_test.cc + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) return() endif() diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 52c4bc1e7965323438de959d5eb1f3b4ef4f4cfe..265f964ddc682868c64669744b130aebbbf86692 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -36,20 +36,16 @@ void GRPCClient::InitEventLoop() { client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); } -void GRPCClient::SendBeginPass() { - for (auto& it : channels_) { - VLOG(3) << "send begin pass to: " << it.first; - this->AsyncSendBeginPass(it.first); - } - this->Wait(); -} - -void GRPCClient::SendEndPass() { - for (auto& it : channels_) { - VLOG(3) << "send end pass to " << it.first; - this->AsyncSendEndPass(it.first); +void GRPCClient::SendComplete() { + std::unique_lock lk(completed_mutex_); + if (!completed_) { + for (auto& it : channels_) { + VLOG(3) << "send complete message to " << it.first; + this->AsyncSendComplete(it.first); + } + PADDLE_ENFORCE(this->Wait(), "internal grpc error"); + completed_ = true; } - this->Wait(); } GRPCClient::~GRPCClient() { @@ -239,32 +235,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, req_count_++; } -void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) { +void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); s->Prepare(time_out); sendrecv::VariableMessage req; - req.set_varname(BEGIN_PASS_MESSAGE); + req.set_varname(COMPLETE_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; } -void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) { - const auto ch = GetChannel(ep); - - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - s->Prepare(time_out); - - sendrecv::VariableMessage req; - req.set_varname(END_PASS_MESSAGE); - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; -} - void GRPCClient::AsyncCheckpointNotify(const std::string& ep, const std::string& dir, int64_t time_out) { diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 11de84d9e265b2ca75d6d72a1d1e8797763f96a5..8351d825f817437e1b3691e916952dd9a86af491 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor { class GRPCClient : public RPCClient { public: - GRPCClient() : ok_(true) {} + GRPCClient() : ok_(true), completed_(false) {} virtual ~GRPCClient(); bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, @@ -201,17 +201,12 @@ class GRPCClient : public RPCClient { void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBeginPass(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendEndPass(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + void AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; bool Wait() override; - void SendBeginPass() override; - - void SendEndPass() override; + void SendComplete() override; protected: void InitImpl() override; @@ -238,6 +233,10 @@ class GRPCClient : public RPCClient { // mutex for GetChannel thread safety std::mutex chan_mutex_; DISABLE_COPY_AND_ASSIGN(GRPCClient); + + // mutex for sending complete message only once + std::mutex completed_mutex_; + bool completed_; }; } // namespace distributed diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3d61171dff98d6752be98b4b90577bfd059525ab..64ac7281848f91302bc0aa3cb81dd198e56fb653 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -43,8 +43,6 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV" -#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV" -#define END_PASS_MESSAGE "END_PASS@RECV" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index f1f84072d47e58eaa81dd66dc018e17b182bb57b..55995783c6eab10632ab2a5bca64ca856f000df1 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -55,10 +55,9 @@ bool RequestSendHandler::Handle(const std::string& varname, if (varname == BATCH_BARRIER_MESSAGE) { VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == BEGIN_PASS_MESSAGE) { - VLOG(3) << "sync: recv begin pass message"; - rpc_server_->WaitCond(kRequestSend); - rpc_server_->BeginPass(); + } else if (varname == COMPLETE_MESSAGE) { + VLOG(3) << "sync: recv complete message"; + rpc_server_->Complete(); } else { VLOG(3) << "sync: received var_name: " << varname; rpc_server_->WaitCond(kRequestSend); @@ -94,14 +93,12 @@ bool RequestGetHandler::Handle(const std::string& varname, if (varname == FETCH_BARRIER_MESSAGE) { VLOG(3) << "sync: recv fetch barrier message"; rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else if (varname == END_PASS_MESSAGE) { - rpc_server_->EndPass(); } else { rpc_server_->WaitCond(kRequestGet); *outvar = scope_->FindVar(varname); } } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) { + if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { *outvar = scope_->FindVar(varname); } } diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 4d87376fbf776e29156b78d826f5012bc53460df..22a022a5d25e5c6628b80294494b87ca105a04c7 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -60,17 +60,13 @@ class RPCClient { const std::string& dir, int64_t time_out = FLAGS_rpc_deadline) = 0; - virtual void AsyncSendBeginPass(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual void AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) = 0; - virtual void AsyncSendEndPass(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - // BeginePass/EndPass tells all the pserver that start/end a pass, so that - // the pserver can increase/reduce it's barrier count, and continue to train + // Complete tells all the pserver instances that finishe the training, + // the pserver can reduce it's barrier count, and continue to train // with other trainers. - virtual void SendBeginPass() = 0; - virtual void SendEndPass() = 0; + virtual void SendComplete() = 0; virtual bool Wait() = 0; diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index d49ee34eeaf4e80f6fd4f8cdc548cc2b938d0f2a..83b14fa64d735d80f43bf55c798cddb2f3ea7032 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -64,18 +64,7 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { } } -void RPCServer::BeginPass() { - VLOG(4) << "RPCServer begin increase pass barrier"; - { - std::unique_lock lock(mutex_); - client_num_++; - VLOG(4) << "increase client_num to: " << client_num_; - } - barrier_cond_.notify_all(); -} - -void RPCServer::EndPass() { - VLOG(4) << "RPCServer begin increase pass barrier"; +void RPCServer::Complete() { { std::unique_lock lock(mutex_); client_num_--; @@ -87,6 +76,11 @@ void RPCServer::EndPass() { barrier_cond_.notify_all(); } +int RPCServer::GetClientNum() { + std::unique_lock lock(mutex_); + return client_num_; +} + void RPCServer::ResetBarrierCounter() { VLOG(3) << "RPCServer ResetBarrierCounter "; std::unique_lock lock(mutex_); diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index 833991c8aa6e7cfd10f2aa52f9218be7ff8ccebf..fd914d7a72e61bc9472876c433b65598ef5b1980 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -44,7 +44,7 @@ class RPCServer { int GetSelectedPort() const { return selected_port_; } - int GetClientNum() const; + int GetClientNum(); void SavePort() const; @@ -64,8 +64,7 @@ class RPCServer { void WaitCond(const std::string& rpc_name); void IncreaseBatchBarrier(const std::string rpc_name); - void BeginPass(); - void EndPass(); + void Complete(); void ResetBarrierCounter(); diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index a0693cffabcc561b0adfafc2c49027a890dd5efc..9f2360ec70d2ce5d4e16435595e109c1bf04fd13 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -91,7 +91,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, } } -void StartServer() { +void StartServer(const std::string& rpc_name) { framework::ProgramDesc program; framework::Scope scope; platform::CPUPlace place; @@ -107,14 +107,14 @@ void StartServer() { std::shared_ptr> prefetch_var_name_to_prepared; prefetch_var_name_to_prepared[in_var_name] = prepared[0]; + g_req_handler->SetProgram(&program); g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); g_req_handler->SetDevCtx(&ctx); g_req_handler->SetScope(&scope); g_req_handler->SetExecutor(&exe); - g_rpc_service->RegisterRPC(distributed::kRequestPrefetch, - g_req_handler.get()); + g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( @@ -129,7 +129,7 @@ TEST(PREFETCH, CPU) { distributed::RPCClient* client = distributed::RPCClient::GetInstance(); - std::thread server_thread(StartServer); + std::thread server_thread(StartServer, distributed::kRequestPrefetch); g_rpc_service->WaitServerReady(); int port = g_rpc_service->GetSelectedPort(); @@ -162,3 +162,24 @@ TEST(PREFETCH, CPU) { g_rpc_service.reset(nullptr); g_req_handler.reset(nullptr); } + +TEST(COMPLETE, CPU) { + g_req_handler.reset(new distributed::RequestSendHandler(true)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); + PADDLE_ENFORCE(client != nullptr); + std::thread server_thread(StartServer, distributed::kRequestSend); + g_rpc_service->WaitServerReady(); + int port = g_rpc_service->GetSelectedPort(); + std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); + client->AsyncSendComplete(ep); + client->Wait(); + + EXPECT_EQ(g_rpc_service->GetClientNum(), 1); + + g_rpc_service->ShutDown(); + server_thread.join(); + g_rpc_service.reset(nullptr); + g_req_handler.reset(nullptr); +} diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 311a429f9c307f3913a1ffe5dfb7d84119c9711e..4f7cfc24ec035349f3c85e84d876ad9b5b5493a6 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -38,12 +38,10 @@ class LoDTensorBlockingQueue { public: bool Push(const std::vector& lod_tensor_vec) { - CheckDims(lod_tensor_vec); return queue_.Send(lod_tensor_vec); } bool Push(std::vector&& lod_tensor_vec) { - CheckDims(lod_tensor_vec); return queue_.Send(std::move(lod_tensor_vec)); } @@ -65,21 +63,6 @@ class LoDTensorBlockingQueue { inline bool IsClosed() const { return queue_.IsClosed(); } private: - void CheckDims( - const std::vector& lod_tensor_vec) const { - PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(), - "Expect input size is %d but found %s", dims_.size(), - lod_tensor_vec.size()); - for (size_t i = 0; i < dims_.size(); ++i) { - const auto& in_dims = framework::slice_ddim( - lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size()); - const auto& expect_dims = - framework::slice_ddim(dims_[i], 1, dims_[i].size()); - PADDLE_ENFORCE(in_dims == expect_dims, - "Dims of the %d-th input tensor do not match", i); - } - } - BlockingQueue> queue_; std::vector dims_; }; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 918f3be533d51367eade5f5108ad2eab954a9303..a9fd1869c9df5464db6fc87ac633cdba2d6dbe7f 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -216,7 +216,7 @@ class ReshapeKernel { if (shape_tensor) { auto *shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; - if (platform::is_gpu_place(ctx.GetPlace())) { + if (platform::is_gpu_place(shape_tensor->place())) { TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 43672d6db92a981f0fbe6e8f7079dafc6ae4052e..db641a4bc2c637e0babee6b6bc6e67b068759ff5 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -55,13 +55,14 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, "TensorRT' tensor input requires at most 4 dimensions"); + switch (shape.size()) { case 2: - return nvinfer1::Dims2(shape[0], shape[1]); + return nvinfer1::Dims2(1, shape[1]); case 3: - return nvinfer1::Dims3(shape[0], shape[1], shape[2]); + return nvinfer1::Dims3(1, shape[1], shape[2]); case 4: - return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); + return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]); default: return nvinfer1::Dims(); } diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index a332d70030ffa6a033f6b2b33487a4fd279b7016..32d10fd8a5687ebaae1d7d75af531cbc45ef4245 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -93,13 +93,15 @@ class TensorRTEngineKernel : public framework::OpKernel { auto* fluid_v = context.scope().FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto* fluid_t = fluid_v->GetMutable(); - auto size = inference::analysis::AccuDims(dims.d, dims.nbDims); + fluid_t->Resize(framework::make_ddim(ddim)); // TODO(Superjomn) find some way to determine which device to output the // tensor. // if (platform::is_cpu_place(fluid_t->place())) { // TODO(Superjomn) change this float to dtype size. + auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * + FLAGS_tensorrt_engine_batch_size; engine->GetOutputInCPU(y, fluid_t->mutable_data(platform::CPUPlace()), size * sizeof(float)); diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 82a16361e40513aeaf6f510e450f58989369fcdb..7cb1e47a1516c32fb31a7818e7203b498e31e431 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) { LOG(INFO) << "create block desc"; framework::BlockDesc block_desc(&program, block_); - LOG(INFO) << "create mul op"; - auto* mul = block_desc.AppendOp(); - mul->SetType("mul"); - mul->SetInput("X", std::vector({"x"})); // 2 x 4 - mul->SetInput("Y", std::vector({"y"})); // 4 x 6 - mul->SetOutput("Out", std::vector({"z"})); // 2 x 6 + LOG(INFO) << "create fc op"; + auto* fc0 = block_desc.AppendOp(); + fc0->SetType("fc"); + fc0->SetInput("X", std::vector({"x"})); // 4 x 1 x 1 + fc0->SetInput("Y", std::vector({"y"})); // 4 x 6 + fc0->SetOutput("Out", std::vector({"z"})); // 6 x 1 x 1 LOG(INFO) << "create fc op"; - auto* fc = block_desc.AppendOp(); - fc->SetType("mul"); - fc->SetInput("X", std::vector({"z"})); - fc->SetInput("Y", std::vector({"y0"})); // 6 x 8 - fc->SetOutput("Out", std::vector({"z0"})); // 2 x 8 + auto* fc1 = block_desc.AppendOp(); + fc1->SetType("fc"); + fc1->SetInput("X", std::vector({"z"})); + fc1->SetInput("Y", std::vector({"y0"})); // 6 x 8 + fc1->SetOutput("Out", std::vector({"z0"})); // 8 x 1 x 1 // Set inputs' variable shape in BlockDesc - AddTensorToBlockDesc(block_, "x", std::vector({2, 4})); + // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1} + AddTensorToBlockDesc(block_, "x", std::vector({2, 4, 1, 1})); AddTensorToBlockDesc(block_, "y", std::vector({4, 6})); AddTensorToBlockDesc(block_, "y0", std::vector({6, 8})); AddTensorToBlockDesc(block_, "z", std::vector({2, 6})); // It is wired, need to copy manually. - *block_->add_ops() = *mul->Proto(); - *block_->add_ops() = *fc->Proto(); + *block_->add_ops() = *fc0->Proto(); + *block_->add_ops() = *fc1->Proto(); ASSERT_EQ(block_->ops_size(), 2); LOG(INFO) << "create tensorrt desc"; framework::OpDesc engine_op_desc(nullptr); engine_op_desc.SetType("tensorrt_engine"); - engine_op_desc.SetInput("Xs", std::vector({"x", "y", "y0"})); + engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); @@ -207,5 +208,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); } } // namespace operators } // namespace paddle -USE_TRT_CONVERTER(mul) USE_TRT_CONVERTER(fc) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3e13e7b1ffebf92301df69084b058ca55783e578..ee1c8d46ddfb4f0c09591bb78dc720555dc735b4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -498,10 +498,7 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init()) -#ifdef PADDLE_WITH_DISTRIBUTE - .def("begin_pass", &Executor::BeginPass) - .def("end_pass", &Executor::EndPass) -#endif + .def("close", &Executor::Close) .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, int block_id, bool create_local_scope, bool create_vars) { pybind11::gil_scoped_release release; diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f9e600cb4cb252baead87025db0e0db71e8169d2..4178971398c953236bf8de4d5cb6e93d0e33380c 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -247,6 +247,7 @@ class Executor(object): p.set_place(place) self.executor = core.Executor(p) self.program_caches = dict() + self._closed = False def as_lodtensor(self, data): """ @@ -348,11 +349,23 @@ class Executor(object): ] return outs - def begin_pass(self): - self.executor.begin_pass() + def close(self): + """ + Close this executor. - def end_pass(self): - self.executor.end_pass() + You can no long use this executor after calling this method. + For the distributed training, this method would free the resource on PServers related to + the current Trainer. + + Example: + >>> cpu = core.CPUPlace() + >>> exe = Executor(cpu) + >>> ... + >>> exe.close() + """ + if not self._closed: + self.executor.close() + self._closed = True def run(self, program=None, @@ -405,6 +418,10 @@ class Executor(object): >>> feed={'X': x}, >>> fetch_list=[loss.name]) """ + + if self._closed: + raise RuntimeError("Attempted to use a closed Executor") + if feed is None: feed = {} if not isinstance(feed, dict): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1a61fce37fa76fe0d62ce01caa45511bd0a5c0bf..9fc3849ee071dba0c4d39e2d661f5afd064d1210 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -32,7 +32,6 @@ except Exception, e: import unique_name __all__ = [ - 'Variable', 'Program', 'Operator', 'Parameter', @@ -302,7 +301,7 @@ class Variable(object): __repr__ = __str__ - def set_desc(self, input): + def _set_desc(self, input): """ Set the variable description. @@ -347,7 +346,7 @@ class Variable(object): def type(self): return self.desc.type() - def set_error_clip(self, error_clip): + def _set_error_clip(self, error_clip): """ Set the error_clip. diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index e5eb34eb0f5cfceeaa7ac09d5bd3d9dc60b7692f..2d03e189b968cf55a7006b8ebcff4d9c3e30a232 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -797,104 +797,6 @@ def get_parameter_value_by_name(name, executor, program=None): return get_parameter_value(var, executor) -def get_test_program(filelist, program=None, startup_program=None): - """ - Transpile current train program to a program to read test dataset - if the program is using reader ops like "open_files_op". - """ - - def _copy_reader_var_(block, var, new_name=None): - if new_name == None: - new_name = var.name - new_var = block.create_var( - name=str(new_name), type=core.VarDesc.VarType.READER) - new_var.desc.set_shapes(var.desc.shapes()) - new_var.desc.set_dtypes(var.desc.dtypes()) - new_var.persistable = True - return new_var - - def _get_test_reader_name(train_reader_name): - return train_reader_name + "_test" - - def _is_reader_op(op): - block = op.block - if "Out" in op.output_names: - reader_out = block.vars[op.output("Out")[0]] - if reader_out.type == core.VarDesc.VarType.READER: - return True - return False - - if program == None: - program = default_main_program() - if startup_program == None: - startup_program = default_startup_program() - startup_block = startup_program.global_block() - - # 1. find out the orignal reader var name - startup_reader_op_list = [] - - for op in startup_block.ops: - if _is_reader_op(op): - startup_reader_op_list.append(op) - - if len(startup_reader_op_list) == 0: - return program - - root_reader_op = startup_reader_op_list[0] - train_test_reader_map = {} - # 2. add operators to startup to read open and read test data files - for op in startup_reader_op_list: - assert (len(op.output("Out")) == 1) - train_reader_name = op.output("Out")[0] - train_reader = startup_block.vars[train_reader_name] - test_reader = _copy_reader_var_( - startup_block, - train_reader, - new_name=_get_test_reader_name(train_reader_name)) - train_test_reader_map[train_reader.name] = test_reader - - test_op_inputs = {} - for name in op.input_names: - train_arg_names = op.input(name) - test_arg_vars = [] - for arg_name in train_arg_names: - arg_var = train_test_reader_map[ - arg_name] if name == "UnderlyingReader" else startup_block.vars[ - arg_name] - test_arg_vars.append(arg_var) - test_op_inputs[name] = test_arg_vars - - test_op = startup_block.append_op( - type=op.type, - inputs=test_op_inputs, - outputs={'Out': [test_reader]}, - attrs=op.attrs) - # root reader op's filelist attr for read test files - if op.type == root_reader_op.type: - test_op.set_attr("file_names", filelist) - if op.type == "create_multi_pass_reader": - test_op.set_attr("pass_num", 1) - - # 3. rename reader vars in inference program to different name - # to avoid read from train data. - main_block = program.global_block() - for var in main_block.vars.values(): - if var.type == core.VarDesc.VarType.READER: - main_block._rename_var( - str(var.name), str(_get_test_reader_name(var.name))) - - for op in main_block.ops: - if op.type == root_reader_op.type: - test_op.set_attr("file_names", filelist) - if op.type == "create_multi_pass_reader": - test_op.set_attr("pass_num", 1) - - startup_program._sync_with_cpp() - program._sync_with_cpp() - - return program - - def _load_slice_up_vars(executor, dirname, slice_vars_and_atts): if slice_vars_and_atts == None or len(slice_vars_and_atts) == 0: return diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 79a11a30d602cb33c2583873e0d0f2e15e0fcb8c..f05ae6d5d1900560e37370121bf64f1fcab14357 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -23,25 +23,17 @@ from ops import logical_and, logical_not, logical_or import numpy __all__ = [ - 'split_lod_tensor', - 'merge_lod_tensor', 'While', 'Switch', - 'lod_rank_table', - 'max_sequence_len', - 'lod_tensor_to_array', - 'array_to_lod_tensor', 'increment', 'array_write', 'create_array', 'less_than', 'equal', 'array_read', - 'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', - 'ConditionalBlock', 'StaticRNN', 'reorder_lod_tensor_by_rank', 'ParallelDo', @@ -1457,7 +1449,7 @@ class IfElse(object): if self.status == IfElse.OUT_IF_ELSE_BLOCKS: raise ValueError("input must in true/false blocks") if id(x) not in self.input_table: - parent_block = self.parent_block() + parent_block = self._parent_block() out_true = parent_block.create_var( name=unique_name.generate('ifelse_input' + self.helper.name), dtype=x.dtype) @@ -1483,7 +1475,7 @@ class IfElse(object): else: return out_false - def parent_block(self): + def _parent_block(self): current_block = self.helper.main_program.current_block() return self.helper.main_program.block(current_block.parent_idx) @@ -1499,7 +1491,7 @@ class IfElse(object): out_table = self.output_table[1 if self.status == self.IN_IF_ELSE_TRUE_BLOCKS else 0] - parent_block = self.parent_block() + parent_block = self._parent_block() for each_out in outs: if not isinstance(each_out, Variable): raise TypeError("Each output should be a variable") diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 6071e3e74218e4db4cddc223818d3a9b7086fd86..c7966e36f15ef0e3f30f8a96ad71df04aece0fa1 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -62,10 +62,10 @@ def noam_decay(d_model, warmup_steps): The decayed learning rate. """ global_step = _decay_step_counter(1) - with init_on_cpu(): - a = global_step**-0.5 - b = (warmup_steps**-1.5) * global_step - lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) + + a = global_step**-0.5 + b = (warmup_steps**-1.5) * global_step + lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) return lr_value @@ -108,12 +108,10 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ global_step = _decay_step_counter() - with init_on_cpu(): - # update learning_rate - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * (decay_rate**div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * (decay_rate**div_res) return decayed_lr @@ -138,11 +136,10 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ global_step = _decay_step_counter() - with init_on_cpu(): - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) return decayed_lr @@ -184,12 +181,11 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ global_step = _decay_step_counter() - with init_on_cpu(): - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) - decayed_lr = learning_rate / (1 + decay_rate * div_res) + decayed_lr = learning_rate / (1 + decay_rate * div_res) return decayed_lr @@ -224,25 +220,22 @@ def polynomial_decay(learning_rate, """ global_step = _decay_step_counter() - with init_on_cpu(): - if cycle: - div_res = ops.ceil(global_step / decay_steps) - zero_var = tensor.fill_constant( - shape=[1], dtype='float32', value=0.0) - one_var = tensor.fill_constant( - shape=[1], dtype='float32', value=1.0) - - with control_flow.Switch() as switch: - with switch.case(global_step == zero_var): - tensor.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res - else: - decay_steps_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = ops.elementwise_min(x=global_step, y=decay_steps_var) + if cycle: + div_res = ops.ceil(global_step / decay_steps) + zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0) + one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0) - decayed_lr = (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate + with control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + tensor.assign(input=one_var, output=div_res) + decay_steps = decay_steps * div_res + else: + decay_steps_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps)) + global_step = ops.elementwise_min(x=global_step, y=decay_steps_var) + + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate return decayed_lr @@ -277,28 +270,28 @@ def piecewise_decay(boundaries, values): global_step = _decay_step_counter() - with init_on_cpu(): - lr = tensor.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") + lr = tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") - with control_flow.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = tensor.fill_constant( - shape=[1], dtype='float32', value=float(boundaries[i])) - value_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(global_step < boundary_val): - tensor.assign(value_var, lr) - last_value_var = tensor.fill_constant( + with control_flow.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = tensor.fill_constant( shape=[1], dtype='float32', - value=float(values[len(values) - 1])) - with switch.default(): - tensor.assign(last_value_var, lr) + value=float(boundaries[i]), + force_cpu=True) + value_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(global_step < boundary_val): + tensor.assign(value_var, lr) + last_value_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(values[len(values) - 1])) + with switch.default(): + tensor.assign(last_value_var, lr) return lr @@ -333,9 +326,9 @@ def append_LARS(params_grads, learning_rate, weight_decay): grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad))) if type(param_lr) == float and param_lr == 1.0: decayed_lr = learning_rate * param_norm \ - / _balanced_weight(param_norm, grad_norm) + / _balanced_weight(param_norm, grad_norm) else: decayed_lr = learning_rate * param_lr * param_norm \ - / _balanced_weight(param_norm, grad_norm) + / _balanced_weight(param_norm, grad_norm) # set back param local learning rate param.optimize_attr['learning_rate'] = decayed_lr diff --git a/python/paddle/fluid/tests/demo/text_classification/.gitignore b/python/paddle/fluid/tests/demo/file_reader/.gitignore similarity index 100% rename from python/paddle/fluid/tests/demo/text_classification/.gitignore rename to python/paddle/fluid/tests/demo/file_reader/.gitignore diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py similarity index 94% rename from python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py rename to python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py index 8244617711138d590193b2898de5d2f3aeb1e11e..b839e14889884bca8d27586aa8c1d76fba3458c1 100644 --- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py +++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py @@ -35,7 +35,7 @@ if len(sys.argv) == 1: word_dict = paddle.dataset.imdb.word_dict() else: word_dict = load_vocab(sys.argv[1]) -word_dict[""] = len(word_dict) + word_dict[""] = len(word_dict) print "Dict dim = ", len(word_dict) # input text data @@ -50,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace()) BATCH_SIZE = 128 train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=10000), + paddle.dataset.imdb.train(word_dict), buf_size=25000), batch_size=BATCH_SIZE) test_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/demo/file_reader/train.py b/python/paddle/fluid/tests/demo/file_reader/train.py new file mode 100644 index 0000000000000000000000000000000000000000..bc3a6dc81d24afec66ed1489aead1cff79a59bca --- /dev/null +++ b/python/paddle/fluid/tests/demo/file_reader/train.py @@ -0,0 +1,138 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import numpy +import sys + +TRAIN_FILES = ['train.recordio'] +TEST_FILES = ['test.recordio'] + +DICT_DIM = 5147 + +# embedding dim +emb_dim = 128 + +# hidden dim +hid_dim = 128 + +# class num +class_dim = 2 + +# epoch num +epoch_num = 10 + + +def build_program(is_train): + file_obj_handle = fluid.layers.io.open_files( + filenames=TRAIN_FILES if is_train else TEST_FILES, + shapes=[[-1, 1], [-1, 1]], + lod_levels=[1, 0], + dtypes=['int64', 'int64']) + + file_obj = fluid.layers.io.double_buffer(file_obj_handle) + + with fluid.unique_name.guard(): + + data, label = fluid.layers.read_file(file_obj) + + emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim]) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + + # cross entropy loss + cost = fluid.layers.cross_entropy(input=prediction, label=label) + + # mean loss + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + if is_train: + # SGD optimizer + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle} + + +def main(): + train = fluid.Program() + startup = fluid.Program() + test = fluid.Program() + + with fluid.program_guard(train, startup): + train_args = build_program(is_train=True) + + with fluid.program_guard(test, startup): + test_args = build_program(is_train=False) + + use_cuda = fluid.core.is_compiled_with_cuda() + # startup + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place=place) + exe.run(startup) + + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + loss_name=train_args['loss'].name, + main_program=train) + test_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, main_program=test, share_vars_from=train_exe) + + fetch_var_list = [var.name for var in train_args['log']] + for epoch_id in range(epoch_num): + # train + try: + batch_id = 0 + while True: + loss, acc = map(numpy.array, + train_exe.run(fetch_list=fetch_var_list)) + print 'Train epoch', epoch_id, 'batch', batch_id, 'loss:', loss, 'acc:', acc + batch_id += 1 + except fluid.core.EOFException: + print 'End of epoch', epoch_id + train_args['file'].reset() + + # test + loss = [] + acc = [] + try: + while True: + loss_np, acc_np = map(numpy.array, + test_exe.run(fetch_list=fetch_var_list)) + loss.append(loss_np[0]) + acc.append(acc_np[0]) + except: + test_args['file'].reset() + print 'Test loss:', numpy.mean(loss), 'acc:', numpy.mean(acc) + + +if __name__ == '__main__': + main() diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py deleted file mode 100644 index 281c2869d642c7fe41a386c42208ca2da1dc2891..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/text_classification/train.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid as fluid -import numpy -import sys - -TRAIN_FILES = ['train.recordio'] -TEST_FILES = ['test.recordio'] - -DICT_DIM = 89528 - -# embedding dim -emb_dim = 128 - -# hidden dim -hid_dim = 128 - -# hidden dim2 -hid_dim2 = 96 - -# class num -class_dim = 2 - - -def network_cfg(is_train, pass_num=100): - with fluid.unique_name.guard(): - train_file_obj = fluid.layers.open_files( - filenames=TRAIN_FILES, - pass_num=pass_num, - shapes=[[-1, 1], [-1, 1]], - lod_levels=[1, 0], - dtypes=['int64', 'int64']) - - test_file_obj = fluid.layers.open_files( - filenames=TEST_FILES, - pass_num=1, - shapes=[[-1, 1], [-1, 1]], - lod_levels=[1, 0], - dtypes=['int64', 'int64']) - - if is_train: - file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000) - else: - file_obj = test_file_obj - - file_obj = fluid.layers.double_buffer( - file_obj, - name="train_double_buffer" if is_train else 'test_double_buffer') - - data, label = fluid.layers.read_file(file_obj) - - emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim]) - - # sequence conv with window size = 3 - win_size = 3 - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=win_size, - act="tanh", - pool_type="max") - - # fc layer after conv - fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) - - # probability of each class - prediction = fluid.layers.fc(input=[fc_1], - size=class_dim, - act="softmax") - # cross entropy loss - cost = fluid.layers.cross_entropy(input=prediction, label=label) - - # mean loss - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - if is_train: - # SGD optimizer - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) - sgd_optimizer.minimize(avg_cost) - - return { - 'loss': avg_cost, - 'log': [avg_cost, acc], - 'file': train_file_obj if is_train else test_file_obj - } - - -def main(): - train = fluid.Program() - startup = fluid.Program() - - with fluid.program_guard(train, startup): - train_args = network_cfg(is_train=True) - - test = fluid.Program() - - with fluid.program_guard(test, fluid.Program()): - test_args = network_cfg(is_train=False) - - # startup - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place=place) - exe.run(startup) - - train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=train_args['loss'].name, main_program=train) - - fetch_var_list = [var.name for var in train_args['log']] - for i in xrange(sys.maxint): - result = map(numpy.array, - train_exe.run(fetch_list=fetch_var_list - if i % 1000 == 0 else [])) - if len(result) != 0: - print 'Train: ', result - - if i % 1000 == 0: - test_exe = fluid.ParallelExecutor( - use_cuda=True, main_program=test, share_vars_from=train_exe) - loss = [] - acc = [] - try: - while True: - loss_np, acc_np = map( - numpy.array, test_exe.run(fetch_list=fetch_var_list)) - loss.append(loss_np[0]) - acc.append(acc_np[0]) - except: - test_args['file'].reset() - print 'TEST: ', numpy.mean(loss), numpy.mean(acc) - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index 89f4c64975802dc1827ec17ed3626b91e36d6971..3dc858971c584cca947cd958680dbdcf25df9e99 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -36,7 +36,7 @@ with fluid.program_guard(main_program=prog): avg_cost = fluid.layers.mean(cost) prog_clip = prog.clone() -prog_clip.block(0).var(hidden1.name).set_error_clip( +prog_clip.block(0).var(hidden1.name)._set_error_clip( fluid.clip.ErrorClipByValue( max=CLIP_MAX, min=CLIP_MIN)) diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py index 1b58925599de62510ea9048f5210bb0b7e49f933..799c31dfe5161ff6aef47601f1b6f6e38885760b 100644 --- a/python/paddle/fluid/tests/test_if_else_op.py +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -19,6 +19,10 @@ from paddle.fluid.executor import Executor from paddle.fluid.optimizer import MomentumOptimizer import paddle.fluid.core as core import paddle.fluid as fluid +from paddle.fluid.layers.control_flow import split_lod_tensor +from paddle.fluid.layers.control_flow import merge_lod_tensor +from paddle.fluid.layers.control_flow import ConditionalBlock + import unittest import numpy as np @@ -34,11 +38,10 @@ class TestMNISTIfElseOp(unittest.TestCase): limit = layers.fill_constant(shape=[1], dtype='int64', value=5) cond = layers.less_than(x=label, y=limit) - true_image, false_image = layers.split_lod_tensor( - input=image, mask=cond) + true_image, false_image = split_lod_tensor(input=image, mask=cond) true_out = layers.create_tensor(dtype='float32') - true_cond = layers.ConditionalBlock([cond]) + true_cond = ConditionalBlock([cond]) with true_cond.block(): hidden = layers.fc(input=true_image, size=100, act='tanh') @@ -46,14 +49,14 @@ class TestMNISTIfElseOp(unittest.TestCase): layers.assign(input=prob, output=true_out) false_out = layers.create_tensor(dtype='float32') - false_cond = layers.ConditionalBlock([cond]) + false_cond = ConditionalBlock([cond]) with false_cond.block(): hidden = layers.fc(input=false_image, size=200, act='tanh') prob = layers.fc(input=hidden, size=10, act='softmax') layers.assign(input=prob, output=false_out) - prob = layers.merge_lod_tensor( + prob = merge_lod_tensor( in_true=true_out, in_false=false_out, mask=cond, x=image) loss = layers.cross_entropy(input=prob, label=label) avg_loss = layers.mean(loss) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 6824ede82b74c4e9783682149db870a471c35079..82b5e7cf0b3633eb04ab97c5300b1926b9d47cb6 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -251,7 +251,7 @@ class OpTest(unittest.TestCase): for out_name, out_dup in Operator.get_op_outputs(self.op_type): fetch_list.append(str(out_name)) # fetch_list = map(block.var, fetch_list) - if not isinstance(fetch_list[0], Variable): + if not isinstance(fetch_list[0], fluid.framework.Variable): fetch_list = map(block.var, fetch_list) outs = executor.run(program, feed=feed_map, diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py index 084b8d37386fac0366c190f5f30dd39467072498..d9f83905e6135e22f74e749857f9b0fbe464d3f4 100644 --- a/python/paddle/fluid/tests/unittests/test_conditional_block.py +++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py @@ -18,14 +18,15 @@ import paddle.fluid.core as core from paddle.fluid.framework import default_startup_program, default_main_program from paddle.fluid.executor import Executor from paddle.fluid.backward import append_backward +from paddle.fluid.layers.control_flow import ConditionalBlock import numpy -class ConditionalBlock(unittest.TestCase): +class ConditionalBlockTest(unittest.TestCase): def test_forward(self): data = layers.data(name='X', shape=[1], dtype='float32') data.stop_gradient = False - cond = layers.ConditionalBlock(inputs=[data]) + cond = ConditionalBlock(inputs=[data]) out = layers.create_tensor(dtype='float32') with cond.block(): hidden = layers.fc(input=data, size=10) diff --git a/python/paddle/fluid/tests/unittests/test_const_value.py b/python/paddle/fluid/tests/unittests/test_const_value.py index d1075d514e9b2b692f271f10a005815a66b421fb..58ac6fa0a9a30a08a831111513777cca59062724 100644 --- a/python/paddle/fluid/tests/unittests/test_const_value.py +++ b/python/paddle/fluid/tests/unittests/test_const_value.py @@ -16,7 +16,7 @@ import unittest import paddle.fluid.framework as framework -class ConditionalBlock(unittest.TestCase): +class ConstantTest(unittest.TestCase): def test_const_value(self): self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD") self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@") diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 0faed94deb4808783027d776e0f4c61da0db457a..4448de8839d7ad4ad1f70ecdc4ac94da1e619adb 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -17,6 +17,12 @@ import paddle import unittest import numpy +from paddle.fluid.layers.control_flow import lod_rank_table +from paddle.fluid.layers.control_flow import max_sequence_len +from paddle.fluid.layers.control_flow import lod_tensor_to_array +from paddle.fluid.layers.control_flow import array_to_lod_tensor +from paddle.fluid.layers.control_flow import shrink_memory + class TestDynRNN(unittest.TestCase): def setUp(self): @@ -38,12 +44,11 @@ class TestDynRNN(unittest.TestCase): label = fluid.layers.data(name='label', shape=[1], dtype='float32') - rank_table = fluid.layers.lod_rank_table(x=sent_emb) + rank_table = lod_rank_table(x=sent_emb) - sent_emb_array = fluid.layers.lod_tensor_to_array( - x=sent_emb, table=rank_table) + sent_emb_array = lod_tensor_to_array(x=sent_emb, table=rank_table) - seq_len = fluid.layers.max_sequence_len(rank_table=rank_table) + seq_len = max_sequence_len(rank_table=rank_table) i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) i.stop_gradient = False @@ -66,7 +71,7 @@ class TestDynRNN(unittest.TestCase): mem = fluid.layers.array_read(array=mem_array, i=i) ipt = fluid.layers.array_read(array=sent_emb_array, i=i) - mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table) + mem = shrink_memory(x=mem, i=i, table=rank_table) hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh') @@ -75,8 +80,7 @@ class TestDynRNN(unittest.TestCase): fluid.layers.array_write(x=hidden, i=i, array=mem_array) fluid.layers.less_than(x=i, y=seq_len, cond=cond) - all_timesteps = fluid.layers.array_to_lod_tensor( - x=out, table=rank_table) + all_timesteps = array_to_lod_tensor(x=out, table=rank_table) last = fluid.layers.sequence_last_step(input=all_timesteps) logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits( diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index 6382e290eb30c621da64d5c600be6d8a7c6254f1..e628195e7265ec564bd64a212c4a35fdff495063 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -91,20 +91,21 @@ class TestLearningRateDecay(unittest.TestCase): def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn, kwargs): + main_prog = fluid.Program() + startup_prog = fluid.Program() - decayed_lr = fluid_decay_fn(**kwargs) + with fluid.program_guard(main_prog, startup_prog): + decayed_lr = fluid_decay_fn(**kwargs) place = fluid.CPUPlace() exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + exe.run(startup_prog) - fluid.memory_optimize(fluid.default_main_program()) + fluid.memory_optimize(main_prog) for step in range(10): - lr_val, = exe.run(fluid.default_main_program(), - feed={}, - fetch_list=[decayed_lr]) + lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr]) python_decayed_lr = python_decay_fn( global_step=float(step), **kwargs) self.assertAlmostEqual( diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py index bac5e502318397b43e9867d5fc9e4e8cd33394b8..16e85830ffa51ec428951570cc7a038f3d10c873 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py +++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid.layers import lod_rank_table, data +from paddle.fluid.layers import data +from paddle.fluid.layers.control_flow import lod_rank_table from paddle.fluid.executor import Executor import paddle.fluid.core as core import numpy diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py index cebe6997bb4152519dabbabfc0404d6036bc4e65..5a4580116bc7009c73f1de14a265bf2cea5acf9b 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py @@ -20,6 +20,11 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.executor import Executor from paddle.fluid.backward import append_backward +from paddle.fluid.layers.control_flow import lod_rank_table +from paddle.fluid.layers.control_flow import max_sequence_len +from paddle.fluid.layers.control_flow import lod_tensor_to_array +from paddle.fluid.layers.control_flow import array_to_lod_tensor + class TestCPULoDTensorArrayOps(unittest.TestCase): def place(self): @@ -137,13 +142,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): with program_guard(program): x = layers.data(name='x', shape=[10]) x.persistable = True - table = layers.lod_rank_table(x, level=level) - max_len = layers.max_sequence_len(table) + table = lod_rank_table(x, level=level) + max_len = max_sequence_len(table) max_len.persistable = True - array = layers.lod_tensor_to_array(x, table) + array = lod_tensor_to_array(x, table) array.persistable = True - result = layers.array_to_lod_tensor(array, table) + result = array_to_lod_tensor(array, table) result.persistable = True exe = Executor(place) scope = core.Scope() @@ -181,9 +186,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): with program_guard(program): x = layers.data( name='x', shape=[1], dtype='float32', stop_gradient=False) - table = layers.lod_rank_table(x, level=0) - array = layers.lod_tensor_to_array(x, table) - result = layers.array_to_lod_tensor(array, table) + table = lod_rank_table(x, level=0) + array = lod_tensor_to_array(x, table) + result = array_to_lod_tensor(array, table) mean = layers.mean(result) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index b21e16439a5070e5f6d763e1617d4cfffe8bd618..76389d916fc39f470a22aed4792bf7b754600436 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -107,44 +107,24 @@ class TestMNIST(TestParallelExecutorBase): label = np.ones(shape=[32, 1], dtype='int64') return img, label - # simple_fc - def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True): if use_cuda and not core.is_compiled_with_cuda(): return - self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) self.check_network_convergence( - simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) - - img, label = self._init_data() - + model, use_cuda=use_cuda, use_reduce=True) self.check_network_convergence( - simple_fc_net, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=use_reduce) + model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True) - def check_simple_fc_convergence_with_Reduce(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): - return - self.check_network_convergence( - simple_fc_net, use_cuda=use_cuda, use_reduce=True) - self.check_network_convergence( - simple_fc_net, - use_cuda=use_cuda, - allow_op_delay=True, - use_reduce=True) - - img, label = self._init_data() + img, label = self._init_data(random_data) all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - simple_fc_net, + model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=False) reduce_first_loss, reduce_last_loss = self.check_network_convergence( - simple_fc_net, + model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, @@ -153,7 +133,24 @@ class TestMNIST(TestParallelExecutorBase): for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-4) + + # simple_fc + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + if use_cuda and not core.is_compiled_with_cuda(): + return + self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) + self.check_network_convergence( + simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) + + img, label = self._init_data() + + self.check_network_convergence( + simple_fc_net, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=use_reduce) def test_simple_fc(self): # use_cuda @@ -162,8 +159,8 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc_with_new_strategy(self): # use_cuda, use_reduce - self.check_simple_fc_convergence_with_Reduce(True) - self.check_simple_fc_convergence_with_Reduce(False) + self._compare_reduce_and_allreduce(simple_fc_net, True) + self._compare_reduce_and_allreduce(simple_fc_net, False) def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): @@ -209,39 +206,13 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda) - def check_batchnorm_fc_convergence_use_reduce(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): - return - self.check_network_convergence( - fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True) - - img, label = self._init_data() - - all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=False) - reduce_first_loss, reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=True) - - for loss in zip(all_reduce_first_loss, reduce_first_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) - for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-4) - def test_batchnorm_fc(self): self.check_batchnorm_fc_convergence(True) self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence_use_reduce(True) - self.check_batchnorm_fc_convergence_use_reduce(False) + self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + self._compare_reduce_and_allreduce(fc_with_batchnorm, False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py index 9ec05e02973138e3ec233ef07f98afd598ec86b1..18309f457704f522457daefdb8464ae5df2ffcfb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py @@ -120,7 +120,7 @@ class BaseParallelForTest(unittest.TestCase): pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) data = next(generator) - if isinstance(data, fluid.Variable): + if isinstance(data, fluid.framework.Variable): data = [data] with pd.do(): diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py index a70321bd800bf25eeb9e5d197ea7e08626b9aede..6e1cd56b3e309fc014dc981a1e3aa841159fca15 100644 --- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py @@ -15,6 +15,7 @@ import unittest import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.layers.control_flow import lod_rank_table import numpy @@ -34,7 +35,7 @@ class TestReorderLoDTensor(unittest.TestCase): dat.stop_gradient = False rank_dat = fluid.layers.data( name=cls.data_desc[1][0], shape=cls.data_desc[1][1]) - table = fluid.layers.lod_rank_table(rank_dat) + table = lod_rank_table(rank_dat) new_dat = fluid.layers.reorder_lod_tensor_by_rank( x=dat, rank_table=table) loss = fluid.layers.reduce_sum(new_dat) diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py index 24bc2cbaf86e8ed2c6a359c4c4d9a1e1507df746..6f0e337034d1010880514181654170316fd9db19 100644 --- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py +++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py @@ -21,6 +21,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program from paddle.fluid.framework import Program import numpy as np +from paddle.fluid.layers.control_flow import shrink_memory +from paddle.fluid.layers.control_flow import lod_rank_table + class TestShrinkRNNMemoryBase(unittest.TestCase): def setUp(self): @@ -30,15 +33,15 @@ class TestShrinkRNNMemoryBase(unittest.TestCase): x.stop_gradient = False rank_table_tensor = layers.data( 'rank_table_tensor', shape=[1], dtype='float32', lod_level=1) - table = layers.lod_rank_table(x=rank_table_tensor) + table = lod_rank_table(x=rank_table_tensor) i = layers.zeros(dtype='int64', shape=[1]) - self.mem1 = layers.shrink_memory(x=x, i=i, table=table) + self.mem1 = shrink_memory(x=x, i=i, table=table) i = layers.increment(x=i) i.stop_gradient = True - self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table) + self.mem2 = shrink_memory(x=self.mem1, i=i, table=table) i = layers.increment(x=i) i.stop_gradient = True - self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table) + self.mem3 = shrink_memory(x=self.mem2, i=i, table=table) mem3_mean = layers.mean(self.mem3) append_backward(loss=mem3_mean) self.x_grad = self.main_program.global_block().var('x@GRAD') diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py index 0916ed7c9f1e2d6d90c6908983fdc8b177aecbb9..ea1146166d34a31efbd859318b411cea50895fe1 100644 --- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py @@ -19,6 +19,8 @@ import paddle.fluid.layers as layers from paddle.fluid.framework import Program, program_guard from paddle.fluid.executor import Executor from paddle.fluid.backward import append_backward +from paddle.fluid.layers.control_flow import split_lod_tensor +from paddle.fluid.layers.control_flow import merge_lod_tensor class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -96,12 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): y = layers.data(name='y', shape=[1]) y.persistable = True - out_true, out_false = layers.split_lod_tensor( - input=x, mask=y, level=level) + out_true, out_false = split_lod_tensor(input=x, mask=y, level=level) out_true.persistable = True out_false.persistable = True - out = layers.merge_lod_tensor( + out = merge_lod_tensor( in_true=out_true, in_false=out_false, mask=y, x=x, level=level) out.persistable = True @@ -142,9 +143,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): level = 0 - out_true, out_false = layers.split_lod_tensor( - input=x, mask=y, level=level) - out = layers.merge_lod_tensor( + out_true, out_false = split_lod_tensor(input=x, mask=y, level=level) + out = merge_lod_tensor( in_true=out_true, in_false=out_false, mask=y, x=x, level=level) mean = layers.mean(out) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 22a461125e9f9ae7a8a6b51d02774aaa163f7b2a..2dd9ec3e473e538cce19e9c6698ac4a2f76bcba7 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -38,7 +38,7 @@ from ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ default_startup_program, Block, \ - Variable, Parameter, grad_var_name + Parameter, grad_var_name from details import * LOOKUP_TABLE_TYPE = "lookup_table" @@ -918,7 +918,8 @@ class DistributeTranspiler(object): # create table optimize block in pserver program table_opt_op = [ op for op in self.optimize_ops - if op.input("Param")[0] == self.table_name + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name ][0] table_opt_block = pserver_program.create_block(pre_block_idx) # only support sgd now @@ -1075,7 +1076,6 @@ class DistributeTranspiler(object): ] def _clone_var(self, block, var, persistable=True): - assert isinstance(var, Variable) return block.create_var( name=var.name, shape=var.shape, diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 353c82f71632c0fa398bcfcf836cc382e7e501f7..0ca5cf813b51e200da5edd5830767ad9457acec2 100644 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -14,7 +14,7 @@ from collections import defaultdict from .. import core -from ..framework import Program, default_main_program, Parameter, Variable +from ..framework import Program, default_main_program, Parameter from ..backward import _rename_arg_ dtype_to_size = {