diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 08147fdccd51f6899765cae3b5109e68ed27e936..4bdfad93187b0d7e18c4408a578cf5d7ad872b28 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -200,31 +200,23 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
 paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
-paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.lod_rank_table ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=(0,))
-paddle.fluid.layers.max_sequence_len ArgSpec(args=['rank_table'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.lod_tensor_to_array ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.array_to_lod_tensor ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True))
 paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
 paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
 paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.shrink_memory ArgSpec(args=['x', 'i', 'table'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
-paddle.fluid.layers.IfElse.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
@@ -233,9 +225,6 @@ paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs',
 paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ConditionalBlock.__init__ ArgSpec(args=['self', 'inputs', 'is_scalar_condition', 'name'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.ConditionalBlock.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ConditionalBlock.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index bb603efaf30bb72d74b5583abc45d01a16c076a3..409efac6799b6fb8d27a1343a55e7a508760868f 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -32,11 +32,11 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
   for (int h = 0; h < shape.h(); ++h) {
     for (int w = 0; w < shape.w(); ++w) {
       odata[h * ostrides.h() + w * ostrides.w()] =
-          idata[h * ostrides.h() + w * ostrides.w()];
+          idata[h * istrides.h() + w * istrides.w()];
     }
   }
 }
-
+// The input weight data is laid out as C (input dim) * K (output dim).
 // Reorder the data layout from CK to KC.
 void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
                    TensorRTEngine::Weight* oweights) {
@@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter {
     framework::LoDTensor tmp;
     tmp.Resize(Y_t->dims());
-    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
-           Y_t->dims()[0] * Y_t->dims()[1]);
-
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), weight_data,
+           Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
                                   Y_t->memory_size() / sizeof(float)};
@@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter {
     // The data layout of TRT FC layer's weight is different from fluid's FC,
     // need to reorder the elements.
-    ReorderCKtoKC(tmp_weight, &weight);
+    ReorderCKtoKC(weight, &tmp_weight);
 
     // Currently, the framework can only handle one fluid op -> one TRT layer,
     // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
@@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter {
     auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
                                        *const_cast<nvinfer1::ITensor*>(X),
-                                       n_output, weight.get(), bias.get());
+                                       n_output, tmp_weight.get(), bias.get());
 
     auto output_name = op_desc.Output("Out").front();
     engine_->SetITensor(output_name, layer->getOutput(0));
@@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter {
 }  // namespace paddle
 
 REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
-USE_OP(mul);
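For reference, the CK-to-KC reorder that ReorderCKtoKC performs through Reorder2 is an ordinary 2-D transpose of the weight matrix. A minimal standalone sketch of the same index arithmetic (TransposeCKtoKC is a hypothetical helper, not part of this patch):

    // Transpose a row-major C x K weight matrix into a row-major K x C one,
    // mirroring Reorder2's strided copy.
    void TransposeCKtoKC(int c, int k, const float* in, float* out) {
      for (int i = 0; i < c; ++i) {
        for (int j = 0; j < k; ++j) {
          out[j * c + i] = in[i * k + j];
        }
      }
    }

This is also why the Reorder2 fix above matters: reading through ostrides made the copy an identity on the wrong layout, while reading through istrides makes it an actual transpose.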
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 0a02a7bebf9efbd0555707e6cfa701ef1e7d9659..7dabfd9f6a9a8cfbdd1d9a66541180d3499b7bdc 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
 
-  validator.Execute(10);
+  validator.Execute(1);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index a30253072ac581ceca85ca10151a176f87a7cb39..081f4d605975f1408d4d8a8ed3108c04d837a4de 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -23,11 +23,11 @@ namespace tensorrt {
 TEST(fc_op, test) {
   std::unordered_set<std::string> parameters({"mul-Y"});
   framework::Scope scope;
-  TRTConvertValidation validator(20, parameters, scope, 1000);
-
-  validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
-  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
-  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
+  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -38,9 +38,10 @@ TEST(fc_op, test) {
 
   validator.SetOp(*desc.Proto());
 
-  validator.Execute(10);
+  validator.Execute(1);
 }
 
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index 1ce1130e5d660d717a1262a1fbdb4b620462c0b3..674f37f2fdddf013a8f6f4671debbc19c3322423 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
 
-  validator.Execute(10);
+  validator.Execute(1);
 }
 
 }  // namespace tensorrt
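A note on the Execute() changes in these tests: the first constructor argument of TRTConvertValidation is the maximum batch size the underlying engine is built for, and the batch passed to Execute() has to stay within it. A sketch of that contract (sizes hypothetical):

    TRTConvertValidation validator(/*max_batch=*/10, parameters, scope,
                                   /*workspace_size=*/1000);
    // ... DeclInputVar / DeclParamVar / DeclOutputVar, then SetOp ...
    validator.Execute(1);  // runtime batch 1 <= max batch 10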
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 3b1f531adc5d756259df1c350f7f44bf71ee1f93..f14885b238134cdf38a278cd8a0734947bcacfe0 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -39,7 +39,7 @@ namespace tensorrt {
 float random(float low, float high) {
   static std::random_device rd;
   static std::mt19937 mt(rd());
-  std::uniform_real_distribution<float> dist(1.0, 10.0);
+  std::uniform_real_distribution<float> dist(low, high);
   return dist(mt);
 }
@@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
   size_t num_elements = analysis::AccuDims(dims, dims.size());
   PADDLE_ENFORCE_GT(num_elements, 0);
   auto* data = tensor->mutable_data<float>(place);
+
   for (size_t i = 0; i < num_elements; i++) {
     *(data + i) = random(0., 1.);
   }
@@ -68,7 +69,7 @@ class TRTConvertValidation {
                       int workspace_size = 1 << 10)
       : parameters_(parameters), scope_(scope) {
     // create engine.
-    engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
+    engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_));
     engine_->InitNetwork();
 
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
@@ -138,12 +139,11 @@ class TRTConvertValidation {
     cudaStreamSynchronize(*engine_->stream());
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 200;
+    const size_t output_space_size = 2000;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
-      engine_->GetOutputInCPU(output, &trt_out[0],
-                              output_space_size * sizeof(float));
+      engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
       cudaStreamSynchronize(*engine_->stream());
 
       auto* var = scope_.FindVar(output);
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index fefec0df6d03669a294ce9643b666d7416593708..b821c3d0bf425c46fae634fbf53f7ee63100ca5c 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -1,7 +1,7 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License.
 You may obtain a copy of the License at
 
 http://www.apache.org/licenses/LICENSE-2.0
@@ -26,6 +26,8 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+int TensorRTEngine::runtime_batch_ = 1;
+
 void TensorRTEngine::Build(const DescType &paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }
@@ -42,6 +44,7 @@ void TensorRTEngine::Execute(int batch_size) {
   PADDLE_ENFORCE_NOT_NULL(stream_);
   infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
+  SetRuntimeBatch(batch_size);
 }
 
 TensorRTEngine::~TensorRTEngine() {
@@ -80,17 +83,17 @@ void TensorRTEngine::FreezeNetwork() {
       auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
-                    analysis::AccuDims(dims.d, dims.nbDims);
+                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
       PADDLE_ENFORCE_GT(item.second, 0);
     }
 
     auto &buf = buffer(item.first);
     buf.max_size = item.second * max_batch_;
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
-    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
-    // buf.size will changed in the runtime.
+
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
     buf.size = 0;
+    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 1G
     buf.device = DeviceType::GPU;
   }
 }
@@ -105,7 +108,7 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
   auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
   buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
-                        analysis::AccuDims(dims.d, dims.nbDims);
+                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
   PADDLE_ENFORCE(input->isNetworkInput());
   TensorRTEngine::SetITensor(name, input);
   return input;
@@ -149,35 +152,42 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
 void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                     size_t max_size) {
   // determine data size
+  auto *output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
+
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
+  PADDLE_ENFORCE_GE(max_size, dst_size);
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
+  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                     cudaMemcpyDeviceToDevice, *stream_),
                     0);
 }
 
 void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                     size_t max_size) {
-  VLOG(4) << "get output in cpu";
-  auto &buf = buffer(name);
-
-  // Update needed buffer size.
-  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
-  auto dims = infer_engine_->getBindingDimensions(slot_offset);
-  buf.size = kDataTypeSize[static_cast<int>(
-                 infer_engine_->getBindingDataType(slot_offset))] *
-             analysis::AccuDims(dims.d, dims.nbDims);
-  PADDLE_ENFORCE_LE(buf.size, buf.max_size);
   // determine data size
+
+  auto *output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  PADDLE_ENFORCE_GT(it->second, 0);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
+  PADDLE_ENFORCE_GE(max_size, dst_size);
+  auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  // DEBUG
-  memset(dst, 0, buf.size);
-  PADDLE_ENFORCE_EQ(
-      0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
+                                       cudaMemcpyDeviceToHost, *stream_));
 }
 
 Buffer &TensorRTEngine::buffer(const std::string &name) {
@@ -225,6 +235,12 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
   return itensor_map_[name];
 }
 
+void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
+  runtime_batch_ = batch_size;
+}
+
+int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
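Both rewritten fetch paths above compute the copy size from the output tensor's static dims and the batch recorded by the last Execute(). A condensed sketch of that computation (OutputBytes is a hypothetical helper; float output assumed):

    // Bytes to copy for one named output: per-sample element count times the
    // runtime batch times the element size. The result must fit both the
    // engine-side buffer and the caller's destination.
    size_t OutputBytes(nvinfer1::ITensor* output, int runtime_batch) {
      nvinfer1::Dims dims = output->getDimensions();
      size_t elems = 1;
      for (int i = 0; i < dims.nbDims; ++i) elems *= dims.d[i];
      return elems * runtime_batch * sizeof(float);  // assuming kFLOAT
    }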
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 7064d333f6db754f88c0ac6956a9527a48bf866c..694468c419c20089de1cdecff1a903ad0cc6e99f 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -117,10 +117,14 @@ class TensorRTEngine : public EngineBase {
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+  void SetRuntimeBatch(size_t batch_size);
+  int GetRuntimeBatch();
 
  private:
   // the max batch size
   int max_batch_;
+  // the runtime batch size
+  static int runtime_batch_;
   // the max memory size the engine uses
   int max_workspace_;
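Since runtime_batch_ is declared static, the value recorded by the most recent Execute() is shared across engine instances. A minimal usage sketch of the new accessors (values hypothetical):

    engine->Execute(4);                     // enqueues with batch 4 and records it
    int batch = engine->GetRuntimeBatch();  // returns 4; used to size output copies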
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index fca3488008ed83418b5e28b8af42d8019aaaa2a4..f8732e51b66bdc78aa35d06ba9651f1942a74b01 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
  protected:
  void SetUp() override {
    ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(1, 1 << 10, &stream_);
+    engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
    engine_->InitNetwork();
  }
@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
   LOG(INFO) << "to get output";
   float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu, sizeof(float));
+  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
 
   LOG(INFO) << "to checkout output";
   ASSERT_EQ(y_cpu, x_v * 2 + 3);
@@ -103,15 +103,49 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   LOG(INFO) << "to get output";
   float y_cpu[2] = {-1., -1.};
+
   auto dims = engine_->GetITensor("y")->getDimensions();
   ASSERT_EQ(dims.nbDims, 3);
   ASSERT_EQ(dims.d[0], 2);
   ASSERT_EQ(dims.d[1], 1);
-  engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
+  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
 }
 
+TEST_F(TensorRTEngineTest, test_conv2d_temp) {
+  // Weight in CPU memory.
+  float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  float raw_bias[1] = {0};
+
+  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
+  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
+  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+                                  nvinfer1::Dims3{1, 3, 3});
+  auto* conv_layer =
+      TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
+                           weight.get(), bias.get());
+  PADDLE_ENFORCE(conv_layer != nullptr);
+  conv_layer->setStride(nvinfer1::DimsHW{1, 1});
+  conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
+
+  engine_->DeclareOutput(conv_layer, 0, "y");
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
+
+  float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
+                           18 * sizeof(float));
+  engine_->Execute(2);
+
+  LOG(INFO) << "to get output";
+  float* y_cpu = new float[18];
+  engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
+  ASSERT_EQ(y_cpu[0], 4.0);
+  ASSERT_EQ(y_cpu[1], 6.0);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 43672d6db92a981f0fbe6e8f7079dafc6ae4052e..db641a4bc2c637e0babee6b6bc6e67b068759ff5 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -55,13 +55,14 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                     "TensorRT's tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT's tensor input requires at most 4 dimensions");
+
   switch (shape.size()) {
     case 2:
-      return nvinfer1::Dims2(shape[0], shape[1]);
+      return nvinfer1::Dims2(1, shape[1]);
     case 3:
-      return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
+      return nvinfer1::Dims3(1, shape[1], shape[2]);
    case 4:
-      return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
+      return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]);
     default:
       return nvinfer1::Dims();
   }
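The Vec2TRT_Dims change pins the first (batch) dimension of every binding to 1 because the actual batch is now supplied at execution time rather than baked into the binding shape. For example (illustrative values):

    // A fluid shape {8, 3, 224, 224} now maps to nvinfer1::Dims4(1, 3, 224, 224);
    // the real batch (8) is passed to the engine's Execute()/enqueue instead.
    nvinfer1::Dims dims = Vec2TRT_Dims({8, 3, 224, 224});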
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index a332d70030ffa6a033f6b2b33487a4fd279b7016..32d10fd8a5687ebaae1d7d75af531cbc45ef4245 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -93,13 +93,15 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto* fluid_v = context.scope().FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
+
       fluid_t->Resize(framework::make_ddim(ddim));
 
       // TODO(Superjomn) find some way to determine which device to output the
       // tensor.
       // if (platform::is_cpu_place(fluid_t->place())) {
       // TODO(Superjomn) change this float to dtype size.
+      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
+                  FLAGS_tensorrt_engine_batch_size;
       engine->GetOutputInCPU(y,
                              fluid_t->mutable_data<float>(platform::CPUPlace()),
                              size * sizeof(float));
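Read together with the engine changes, the kernel's fetch is sized as per-sample elements times the batch flag, converted to bytes because GetOutputInCPU takes a byte budget. A condensed restatement of the hunk above (simplified; float dtype assumed, as the TODO notes):

    // Per-sample element count times the configured engine batch, then bytes.
    auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
                FLAGS_tensorrt_engine_batch_size;
    engine->GetOutputInCPU(y, fluid_t->mutable_data<float>(platform::CPUPlace()),
                           size * sizeof(float));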
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 82a16361e40513aeaf6f510e450f58989369fcdb..7cb1e47a1516c32fb31a7818e7203b498e31e431 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) {
   LOG(INFO) << "create block desc";
   framework::BlockDesc block_desc(&program, block_);
 
-  LOG(INFO) << "create mul op";
-  auto* mul = block_desc.AppendOp();
-  mul->SetType("mul");
-  mul->SetInput("X", std::vector<std::string>({"x"}));     // 2 x 4
-  mul->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
-  mul->SetOutput("Out", std::vector<std::string>({"z"}));  // 2 x 6
+  LOG(INFO) << "create fc op";
+  auto* fc0 = block_desc.AppendOp();
+  fc0->SetType("fc");
+  fc0->SetInput("X", std::vector<std::string>({"x"}));     // 4 x 1 x 1
+  fc0->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
+  fc0->SetOutput("Out", std::vector<std::string>({"z"}));  // 6 x 1 x 1
 
   LOG(INFO) << "create fc op";
-  auto* fc = block_desc.AppendOp();
-  fc->SetType("mul");
-  fc->SetInput("X", std::vector<std::string>({"z"}));
-  fc->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
-  fc->SetOutput("Out", std::vector<std::string>({"z0"}));  // 2 x 8
+  auto* fc1 = block_desc.AppendOp();
+  fc1->SetType("fc");
+  fc1->SetInput("X", std::vector<std::string>({"z"}));
+  fc1->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
+  fc1->SetOutput("Out", std::vector<std::string>({"z0"}));  // 8 x 1 x 1
 
   // Set inputs' variable shape in BlockDesc
-  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}));
+  // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
+  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 1, 1}));
   AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
   AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
   AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
 
   // It is weird, need to copy manually.
-  *block_->add_ops() = *mul->Proto();
-  *block_->add_ops() = *fc->Proto();
+  *block_->add_ops() = *fc0->Proto();
+  *block_->add_ops() = *fc1->Proto();
   ASSERT_EQ(block_->ops_size(), 2);
 
   LOG(INFO) << "create tensorrt desc";
   framework::OpDesc engine_op_desc(nullptr);
   engine_op_desc.SetType("tensorrt_engine");
-  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"}));
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
@@ -207,5 +208,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
 }  // namespace operators
 }  // namespace paddle
 
-USE_TRT_CONVERTER(mul)
 USE_TRT_CONVERTER(fc)
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 79a11a30d602cb33c2583873e0d0f2e15e0fcb8c..f05ae6d5d1900560e37370121bf64f1fcab14357 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -23,25 +23,17 @@ from ops import logical_and, logical_not, logical_or
 import numpy
 
 __all__ = [
-    'split_lod_tensor',
-    'merge_lod_tensor',
     'While',
     'Switch',
-    'lod_rank_table',
-    'max_sequence_len',
-    'lod_tensor_to_array',
-    'array_to_lod_tensor',
     'increment',
     'array_write',
     'create_array',
     'less_than',
     'equal',
     'array_read',
-    'shrink_memory',
     'array_length',
     'IfElse',
     'DynamicRNN',
-    'ConditionalBlock',
     'StaticRNN',
     'reorder_lod_tensor_by_rank',
     'ParallelDo',
@@ -1457,7 +1449,7 @@ class IfElse(object):
         if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
             raise ValueError("input must in true/false blocks")
         if id(x) not in self.input_table:
-            parent_block = self.parent_block()
+            parent_block = self._parent_block()
             out_true = parent_block.create_var(
                 name=unique_name.generate('ifelse_input' + self.helper.name),
                 dtype=x.dtype)
@@ -1483,7 +1475,7 @@ class IfElse(object):
         else:
             return out_false
 
-    def parent_block(self):
+    def _parent_block(self):
         current_block = self.helper.main_program.current_block()
         return self.helper.main_program.block(current_block.parent_idx)
@@ -1499,7 +1491,7 @@ class IfElse(object):
         out_table = self.output_table[1
                                       if self.status == self.IN_IF_ELSE_TRUE_BLOCKS else 0]
-        parent_block = self.parent_block()
+        parent_block = self._parent_block()
         for each_out in outs:
             if not isinstance(each_out, Variable):
                 raise TypeError("Each output should be a variable")
diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
index 1b58925599de62510ea9048f5210bb0b7e49f933..799c31dfe5161ff6aef47601f1b6f6e38885760b 100644
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -19,6 +19,10 @@ from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import MomentumOptimizer
 import paddle.fluid.core as core
 import paddle.fluid as fluid
+from paddle.fluid.layers.control_flow import split_lod_tensor
+from paddle.fluid.layers.control_flow import merge_lod_tensor
+from paddle.fluid.layers.control_flow import ConditionalBlock
+
 import unittest
 import numpy as np
@@ -34,11 +38,10 @@ class TestMNISTIfElseOp(unittest.TestCase):
         limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
         cond = layers.less_than(x=label, y=limit)
-        true_image, false_image = layers.split_lod_tensor(
-            input=image, mask=cond)
+        true_image, false_image = split_lod_tensor(input=image, mask=cond)
 
         true_out = layers.create_tensor(dtype='float32')
-        true_cond = layers.ConditionalBlock([cond])
+        true_cond = ConditionalBlock([cond])
 
         with true_cond.block():
             hidden = layers.fc(input=true_image, size=100, act='tanh')
@@ -46,14 +49,14 @@ class TestMNISTIfElseOp(unittest.TestCase):
             layers.assign(input=prob, output=true_out)
 
         false_out = layers.create_tensor(dtype='float32')
-        false_cond = layers.ConditionalBlock([cond])
+        false_cond = ConditionalBlock([cond])
 
         with false_cond.block():
             hidden = layers.fc(input=false_image, size=200, act='tanh')
             prob = layers.fc(input=hidden, size=10, act='softmax')
             layers.assign(input=prob, output=false_out)
 
-        prob = layers.merge_lod_tensor(
+        prob = merge_lod_tensor(
             in_true=true_out, in_false=false_out, mask=cond, x=image)
         loss = layers.cross_entropy(input=prob, label=label)
         avg_loss = layers.mean(loss)
diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py
index 084b8d37386fac0366c190f5f30dd39467072498..d9f83905e6135e22f74e749857f9b0fbe464d3f4 100644
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
@@ -18,14 +18,15 @@ import paddle.fluid.core as core
 from paddle.fluid.framework import default_startup_program, default_main_program
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
+from paddle.fluid.layers.control_flow import ConditionalBlock
 import numpy
 
-class ConditionalBlock(unittest.TestCase):
+class ConditionalBlockTest(unittest.TestCase):
     def test_forward(self):
         data = layers.data(name='X', shape=[1], dtype='float32')
         data.stop_gradient = False
-        cond = layers.ConditionalBlock(inputs=[data])
+        cond = ConditionalBlock(inputs=[data])
         out = layers.create_tensor(dtype='float32')
         with cond.block():
             hidden = layers.fc(input=data, size=10)
diff --git a/python/paddle/fluid/tests/unittests/test_const_value.py b/python/paddle/fluid/tests/unittests/test_const_value.py
index d1075d514e9b2b692f271f10a005815a66b421fb..58ac6fa0a9a30a08a831111513777cca59062724 100644
--- a/python/paddle/fluid/tests/unittests/test_const_value.py
+++ b/python/paddle/fluid/tests/unittests/test_const_value.py
@@ -16,7 +16,7 @@ import unittest
 import paddle.fluid.framework as framework
 
-class ConditionalBlock(unittest.TestCase):
+class ConstantTest(unittest.TestCase):
     def test_const_value(self):
         self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
         self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index 0faed94deb4808783027d776e0f4c61da0db457a..4448de8839d7ad4ad1f70ecdc4ac94da1e619adb 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -17,6 +17,12 @@ import paddle
 import unittest
 import numpy
 
+from paddle.fluid.layers.control_flow import lod_rank_table
+from paddle.fluid.layers.control_flow import max_sequence_len
+from paddle.fluid.layers.control_flow import lod_tensor_to_array
+from paddle.fluid.layers.control_flow import array_to_lod_tensor
+from paddle.fluid.layers.control_flow import shrink_memory
+
 class TestDynRNN(unittest.TestCase):
     def setUp(self):
@@ -38,12 +44,11 @@ class TestDynRNN(unittest.TestCase):
         label = fluid.layers.data(name='label', shape=[1], dtype='float32')
 
-        rank_table = fluid.layers.lod_rank_table(x=sent_emb)
+        rank_table = lod_rank_table(x=sent_emb)
 
-        sent_emb_array = fluid.layers.lod_tensor_to_array(
-            x=sent_emb, table=rank_table)
+        sent_emb_array = lod_tensor_to_array(x=sent_emb, table=rank_table)
 
-        seq_len = fluid.layers.max_sequence_len(rank_table=rank_table)
+        seq_len = max_sequence_len(rank_table=rank_table)
         i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
         i.stop_gradient = False
@@ -66,7 +71,7 @@ class TestDynRNN(unittest.TestCase):
             mem = fluid.layers.array_read(array=mem_array, i=i)
             ipt = fluid.layers.array_read(array=sent_emb_array, i=i)
-            mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table)
+            mem = shrink_memory(x=mem, i=i, table=rank_table)
 
             hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
@@ -75,8 +80,7 @@ class TestDynRNN(unittest.TestCase):
             fluid.layers.array_write(x=hidden, i=i, array=mem_array)
             fluid.layers.less_than(x=i, y=seq_len, cond=cond)
 
-        all_timesteps = fluid.layers.array_to_lod_tensor(
-            x=out, table=rank_table)
+        all_timesteps = array_to_lod_tensor(x=out, table=rank_table)
         last = fluid.layers.sequence_last_step(input=all_timesteps)
         logits = fluid.layers.fc(input=last, size=1, act=None)
         loss = fluid.layers.sigmoid_cross_entropy_with_logits(
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
index bac5e502318397b43e9867d5fc9e4e8cd33394b8..16e85830ffa51ec428951570cc7a038f3d10c873 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.fluid.layers import lod_rank_table, data
+from paddle.fluid.layers import data
+from paddle.fluid.layers.control_flow import lod_rank_table
 from paddle.fluid.executor import Executor
 import paddle.fluid.core as core
 import numpy
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
index cebe6997bb4152519dabbabfc0404d6036bc4e65..5a4580116bc7009c73f1de14a265bf2cea5acf9b 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
@@ -20,6 +20,11 @@ from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
 
+from paddle.fluid.layers.control_flow import lod_rank_table
+from paddle.fluid.layers.control_flow import max_sequence_len
+from paddle.fluid.layers.control_flow import lod_tensor_to_array
+from paddle.fluid.layers.control_flow import array_to_lod_tensor
+
 class TestCPULoDTensorArrayOps(unittest.TestCase):
     def place(self):
@@ -137,13 +142,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         with program_guard(program):
             x = layers.data(name='x', shape=[10])
             x.persistable = True
-            table = layers.lod_rank_table(x, level=level)
-            max_len = layers.max_sequence_len(table)
+            table = lod_rank_table(x, level=level)
+            max_len = max_sequence_len(table)
             max_len.persistable = True
-            array = layers.lod_tensor_to_array(x, table)
+            array = lod_tensor_to_array(x, table)
             array.persistable = True
 
-            result = layers.array_to_lod_tensor(array, table)
+            result = array_to_lod_tensor(array, table)
             result.persistable = True
         exe = Executor(place)
         scope = core.Scope()
@@ -181,9 +186,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
         with program_guard(program):
             x = layers.data(
                 name='x', shape=[1], dtype='float32', stop_gradient=False)
-            table = layers.lod_rank_table(x, level=0)
-            array = layers.lod_tensor_to_array(x, table)
-            result = layers.array_to_lod_tensor(array, table)
+            table = lod_rank_table(x, level=0)
+            array = lod_tensor_to_array(x, table)
+            result = array_to_lod_tensor(array, table)
 
             mean = layers.mean(result)
diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
index a70321bd800bf25eeb9e5d197ea7e08626b9aede..6e1cd56b3e309fc014dc981a1e3aa841159fca15 100644
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
@@ -15,6 +15,7 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.layers.control_flow import lod_rank_table
 import numpy
@@ -34,7 +35,7 @@ class TestReorderLoDTensor(unittest.TestCase):
         dat.stop_gradient = False
         rank_dat = fluid.layers.data(
             name=cls.data_desc[1][0], shape=cls.data_desc[1][1])
-        table = fluid.layers.lod_rank_table(rank_dat)
+        table = lod_rank_table(rank_dat)
         new_dat = fluid.layers.reorder_lod_tensor_by_rank(
             x=dat, rank_table=table)
         loss = fluid.layers.reduce_sum(new_dat)
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
index 24bc2cbaf86e8ed2c6a359c4c4d9a1e1507df746..6f0e337034d1010880514181654170316fd9db19 100644
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
@@ -21,6 +21,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program
 from paddle.fluid.framework import Program
 import numpy as np
 
+from paddle.fluid.layers.control_flow import shrink_memory
+from paddle.fluid.layers.control_flow import lod_rank_table
+
 class TestShrinkRNNMemoryBase(unittest.TestCase):
     def setUp(self):
@@ -30,15 +33,15 @@ class TestShrinkRNNMemoryBase(unittest.TestCase):
         x.stop_gradient = False
         rank_table_tensor = layers.data(
             'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
-        table = layers.lod_rank_table(x=rank_table_tensor)
+        table = lod_rank_table(x=rank_table_tensor)
         i = layers.zeros(dtype='int64', shape=[1])
-        self.mem1 = layers.shrink_memory(x=x, i=i, table=table)
+        self.mem1 = shrink_memory(x=x, i=i, table=table)
         i = layers.increment(x=i)
         i.stop_gradient = True
-        self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table)
+        self.mem2 = shrink_memory(x=self.mem1, i=i, table=table)
         i = layers.increment(x=i)
         i.stop_gradient = True
-        self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table)
+        self.mem3 = shrink_memory(x=self.mem2, i=i, table=table)
         mem3_mean = layers.mean(self.mem3)
         append_backward(loss=mem3_mean)
         self.x_grad = self.main_program.global_block().var('x@GRAD')
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
index 0916ed7c9f1e2d6d90c6908983fdc8b177aecbb9..ea1146166d34a31efbd859318b411cea50895fe1 100644
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
@@ -19,6 +19,8 @@ import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
+from paddle.fluid.layers.control_flow import split_lod_tensor
+from paddle.fluid.layers.control_flow import merge_lod_tensor
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -96,12 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         y = layers.data(name='y', shape=[1])
         y.persistable = True
 
-        out_true, out_false = layers.split_lod_tensor(
-            input=x, mask=y, level=level)
+        out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
         out_true.persistable = True
         out_false.persistable = True
 
-        out = layers.merge_lod_tensor(
+        out = merge_lod_tensor(
             in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
         out.persistable = True
@@ -142,9 +143,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
         level = 0
 
-        out_true, out_false = layers.split_lod_tensor(
-            input=x, mask=y, level=level)
-        out = layers.merge_lod_tensor(
+        out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
+        out = merge_lod_tensor(
             in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
 
         mean = layers.mean(out)