diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 1d62792b80dd002b894da28be9162fc7d3ce054e..fac9f16a89bab311c338475aef7c79015ab466be 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -115,6 +115,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
+cc_test(rw_lock_test SRCS rw_lock_test.cc)
+
 # disable test temporarily.
 # TODO https://github.com/PaddlePaddle/Paddle/issues/11971
 # cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h
index edbe45acb98326ee3bf1d86495832ec8469b634e..f42bab20ed97e372d2da0c4a492a4458ab94e0a0 100644
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ b/paddle/fluid/framework/ir/graph_traits.h
@@ -12,7 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#pragma once
+
 #include
+#include
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
index 2a4009b765e35c6cef8dc9ffb3e04a50f0dc2b38..1418fb5134fdde2392da912b5f1bd9fc74e58400 100644
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include
 
+#include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..16f9cbb65229f10912ee90436c3557aaaca169b8
--- /dev/null
+++ b/paddle/fluid/framework/rw_lock_test.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/framework/rw_lock.h" +#include +#include // NOLINT +#include // NOLINT +#include + +namespace f = paddle::framework; + +void f1(f::RWLock *lock) { + lock->RDLock(); + lock->UNLock(); +} + +TEST(RWLOCK, read_read) { + f::RWLock lock; + lock.RDLock(); + std::thread t1(f1, &lock); + std::thread t2(f1, &lock); + t1.join(); + t2.join(); + lock.UNLock(); +} + +void f2(f::RWLock *lock, std::vector *result) { + lock->RDLock(); + ASSERT_EQ(result->size(), 0UL); + lock->UNLock(); +} + +void f3(f::RWLock *lock, std::vector *result) { + lock->WRLock(); + result->push_back(1); + lock->UNLock(); +} + +TEST(RWLOCK, read_write) { + f::RWLock lock; + std::vector result; + + lock.RDLock(); + std::thread t1(f2, &lock, &result); + t1.join(); + std::thread t2(f3, &lock, &result); + std::this_thread::sleep_for(std::chrono::seconds(1)); + ASSERT_EQ(result.size(), 0UL); + lock.UNLock(); + t2.join(); + ASSERT_EQ(result.size(), 1UL); +} + +void f4(f::RWLock *lock, std::vector *result) { + lock->RDLock(); + ASSERT_EQ(result->size(), 1UL); + lock->UNLock(); +} + +TEST(RWLOCK, write_read) { + f::RWLock lock; + std::vector result; + + lock.WRLock(); + std::thread t1(f4, &lock, &result); + std::this_thread::sleep_for(std::chrono::seconds(1)); + result.push_back(1); + lock.UNLock(); + t1.join(); +} diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 27fe575cb6167a726ff92a8f3d2e47b6f536ba39..b972efe5b0a6942bf0710755bbabbaf863477931 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -8,7 +8,7 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph helper.cc model_store_pass.cc DEPS framework_proto proto_desc) -cc_test(test_node SRCS node_tester.cc DEPS analysis) +cc_test(test_node SRCS node_tester.cc DEPS analysis gflags glog gtest) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 18c32fa09199003f17183207828cdfe4e627ae1a..f40d471cbfcb765895d911c7b36680271e2f0df0 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -23,7 +23,7 @@ namespace paddle { namespace inference { -DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_max_batchsize, 1, "TensorRT maximum batch size"); DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); namespace analysis { @@ -52,7 +52,6 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { - FilterRedundantOutputOfSubGraph(graph); LOG(INFO) << "graph.inputs " << graph->inputs.size(); for (auto &node : GraphTraits(graph).nodes_in_TS()) { if (node.deleted()) continue; diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc index f2e918f3ff41d9db0c3ec38561015967bed26f4e..3339b5044df0cf91d00aa9ddad310d4bf263bc3c 100644 --- a/paddle/fluid/inference/analysis/node.cc +++ b/paddle/fluid/inference/analysis/node.cc @@ -20,17 +20,6 @@ namespace paddle { namespace inference { namespace analysis { -template <> -std::string &NodeAttr::As() { - if (data_.empty()) { - type_index_ = 
std::type_index(typeid(std::string)); - } - PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string))); - return data_; -} - -std::string &NodeAttr::String() { return As(); } - std::vector Value::dot_attrs() const { return std::vector({Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 47e524bc5c4a6b1324d5f182053129311487522d..fb426fb893d12c017deda74fc05016053fbc6b1c 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/device.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { @@ -38,39 +39,35 @@ class NodeMap; // A helper class to maintain the status from Pass. struct NodeAttr { + using any_t = + boost::variant; // NOTE T should be a primary type or a struct combined by several primary // types. // NOTE the STL containers should not use here. // Some usages // Attr attr; // attr.Bool() = true; - bool &Bool() { return As(); } float &Float() { return As(); } int32_t &Int32() { return As(); } int64_t &Int64() { return As(); } void *&Pointer() { return As(); } - std::string &String(); + std::string &String() { return As(); } private: template T &As() { - // init storage in the first usage. - if (data_.empty()) { - VLOG(4) << "resize data to " << sizeof(T); - type_index_ = std::type_index(typeid(T)); - data_.resize(sizeof(T)); + if (type_index_ == typeid(NodeAttr)) { + type_index_ = typeid(T); + any_data_ = T(); + } else { + PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type"); } - PADDLE_ENFORCE(framework::IsType(type_index_), - "type not matched, origin is %s, want %s", - DataTypeNamer::Global().repr(type_index_), - DataTypeNamer::Global().repr()); - PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); - return *reinterpret_cast(&data_[0]); + return boost::get(any_data_); } private: - std::string data_; + any_t any_data_; std::type_index type_index_{typeid(NodeAttr)}; }; diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc index ea832a3a7e47758be9b6bd59a4325ddb576ec446..8bbcfff53741772ee3705e2efdf46a1b59ee02ab 100644 --- a/paddle/fluid/inference/analysis/node_tester.cc +++ b/paddle/fluid/inference/analysis/node_tester.cc @@ -20,6 +20,24 @@ namespace paddle { namespace inference { namespace analysis { +TEST(NodeAttr, bool) { + NodeAttr x; + x.Bool() = true; + ASSERT_EQ(x.Bool(), true); +} + +TEST(NodeAttr, int32) { + NodeAttr x; + x.Int32() = 32; + ASSERT_EQ(x.Int32(), 32); +} + +TEST(NodeAttr, string) { + NodeAttr x; + x.String() = "Hello"; + ASSERT_EQ(x.String(), "Hello"); +} + TEST(Node, Attr) { // Node is an abstract class, use Value instead for they share the same Attr // logic. 
@@ -27,6 +45,9 @@ TEST(Node, Attr) { auto* node = nodes.Create(Node::Type::kValue); node->attr("v0").Int32() = 2008; ASSERT_EQ(node->attr("v0").Int32(), 2008); + + node->attr("str").String() = "hello world"; + ASSERT_EQ(node->attr("str").String(), "hello world"); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 80809d4c43ca08298bad25cf614dcb4117d3f99a..9146c0e45e77b5f120d3be622f74e3008bca2b6f 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -153,6 +153,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { inlink_or_outlink_cleaner(o->inlinks); } } + FilterRedundantOutputOfSubGraph(graph_); } } // namespace analysis diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 3e829dd726b132844a45427b7b0b39eedf197496..7824ef2649af81a2390ff3bc537eb7c93c70e402 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -13,16 +13,22 @@ else use_gpu_list='false' fi +PREFIX=inference-vis-demos%2F +URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} + # download vis_demo data function download() { dir_name=$1 mkdir -p $dir_name cd $dir_name - wget -q ${URL_ROOT}$dir_name.tar.gz - tar xzf *.tar.gz + if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then + echo "${PREFIX}{dir_name}.tar.gz has been downloaded." + else + wget -q ${URL_ROOT}$dir_name.tar.gz + tar xzf *.tar.gz + fi cd .. } -URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F mkdir -p data cd data vis_demo_list='se_resnext50 ocr mobilenet' diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index dba1d50b2d1c487ced8e6ca51f2d257641ad5fc7..841a95db38ce7cf0cb5961ff04cb569ee2633e6f 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -35,12 +35,20 @@ class Conv2dOpConverter : public OpConverter { auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); - auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); - PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); - const int n_output = Y_t->dims()[0]; - const int filter_h = Y_t->dims()[2]; - const int filter_w = Y_t->dims()[3]; + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); + + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); + const int n_output = weight_tensor->dims()[0]; + const int filter_h = weight_tensor->dims()[2]; + const int filter_w = weight_tensor->dims()[3]; const int groups = boost::get(op_desc.GetAttr("groups")); const std::vector dilations = @@ -57,7 +65,7 @@ class Conv2dOpConverter : public OpConverter { TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), - Y_t->memory_size() / sizeof(float)}; + weight_tensor->memory_size() / sizeof(float)}; TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto* layer = TRT_ENGINE_ADD_LAYER( @@ -70,6 +78,8 @@ class Conv2dOpConverter : public OpConverter { layer->setNbGroups(groups); auto output_name = op_desc.Output("Output").front(); + engine_->weight_map[op_desc.Input("Filter").front()] = + 
std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 3744550f60a1696aedd8a3ecd24f1b21d22325b9..60a72b4eb5c75b5cd12305f13763a9a1a567213f 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { @@ -40,10 +39,17 @@ class ElementwiseWeightOpConverter : public OpConverter { auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); - auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - std::vector dims_y = framework::vectorize2int(Y_t->dims()); + std::vector dims_y = framework::vectorize2int(weight_tensor->dims()); if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); } @@ -70,9 +76,9 @@ class ElementwiseWeightOpConverter : public OpConverter { PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); } - TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - Y_t->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{ + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + weight_tensor->memory_size() / sizeof(float)}; TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, @@ -82,6 +88,8 @@ class ElementwiseWeightOpConverter : public OpConverter { engine_, Scale, *const_cast(X), scale_mode, shift_weights.get(), scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Out")[0]; + + engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 39fe1f609d7b94638506877fc301f19ef33ec8ac..ad98d85aae9cf594922aca00c43718ccfbce2278 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -12,12 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/engine.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace inference { @@ -73,19 +68,26 @@ class FcOpConverter : public OpConverter { auto* Y_t = Y_v->GetMutable(); // This may trigger a GPU->CPU copy, because TRT's weight can only be // assigned from CPU memory, that can't be avoided. - auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); - PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix - size_t n_output = Y_t->dims()[1]; + platform::CPUPlace cpu_place; + framework::LoDTensor weight_tensor; + weight_tensor.Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, &weight_tensor); - framework::LoDTensor tmp; - tmp.Resize(Y_t->dims()); - memcpy(tmp.mutable_data(platform::CPUPlace()), weight_data, + auto* weight_data = weight_tensor.mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL); // a matrix + size_t n_output = weight_tensor.dims()[1]; + + std::unique_ptr tmp(new framework::LoDTensor()); + tmp->Resize(weight_tensor.dims()); + + memcpy(tmp->mutable_data(platform::CPUPlace()), weight_data, Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), Y_t->memory_size() / sizeof(float)}; TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, - static_cast(tmp.data()), + static_cast(tmp->data()), Y_t->memory_size() / sizeof(float)); weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); tmp_weight.dims = weight.dims; @@ -106,6 +108,7 @@ class FcOpConverter : public OpConverter { auto output_name = op_desc.Output("Out").front(); engine_->SetITensor(output_name, layer->getOutput(0)); + engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); if (test_mode) { engine_->DeclareOutput(output_name); } diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 11cad95361867476c6f775af778015da37f1cfb1..73f1b28ddf73403862e55d102a259d7b6cf67b1f 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -33,6 +33,7 @@ class Pool2dOpConverter : public OpConverter { PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + bool global_pooling = boost::get(op_desc.GetAttr("global_pooling")); std::string pool_type = boost::get(op_desc.GetAttr("pooling_type")); std::vector ksize = @@ -42,7 +43,13 @@ class Pool2dOpConverter : public OpConverter { std::vector paddings = boost::get>(op_desc.GetAttr("paddings")); - const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + if (global_pooling == true) { + nvinfer1::Dims input_shape = input1->getDimensions(); + int nbDims = input_shape.nbDims; + nv_ksize.d[0] = input_shape.d[nbDims - 2]; + nv_ksize.d[1] = input_shape.d[nbDims - 1]; + } const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index d6651a5b244ba31a01220e6299cb2016ae61fe64..01d7f700da9cc67d0ebbd3d9649e3823f58a8811 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -57,6 +57,7 @@ TEST(OpConverter, ConvertBlock) { auto* x = scope.Var("conv2d-Y"); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); + x_tensor->mutable_data(platform::CUDAPlace(0)); OpConverter converter; converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope, diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index c5dddbc8cd37b9fb1ba39382af2da5ad045f3af2..aedd6b62df040eeee4e48f628128511cd8bf4439 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -20,7 +20,7 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(Pool2dOpConverter, main) { +void test_pool2d(bool global_pooling) { framework::Scope scope; std::unordered_set parameters; TRTConvertValidation validator(5, parameters, scope, 1 << 15); @@ -28,7 +28,10 @@ TEST(Pool2dOpConverter, main) { // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); - validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); + if (global_pooling) + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1)); + else + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; @@ -45,6 +48,7 @@ TEST(Pool2dOpConverter, main) { desc.SetAttr("ksize", ksize); desc.SetAttr("strides", strides); desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); @@ -53,6 +57,10 @@ TEST(Pool2dOpConverter, main) { validator.Execute(3); } +TEST(Pool2dOpConverter, normal) { test_pool2d(false); } + +TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); } + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 4265f33f28fe36b1745baf4761c3c85e3a281d6b..35ecfd02f43ca0715793a9b6b3997dbc943b5769 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" @@ -48,11 +49,17 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, auto dims = tensor->dims(); size_t num_elements = analysis::AccuDims(dims, dims.size()); PADDLE_ENFORCE_GT(num_elements, 0); - auto* data = tensor->mutable_data(place); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(dims); + auto* temp_data = temp_tensor.mutable_data(cpu_place); for (size_t i = 0; i < num_elements; i++) { - *(data + i) = random(0., 1.); + *(temp_data + i) = random(0., 1.); } + + TensorCopySync(temp_tensor, place, tensor); } /* @@ -101,8 +108,8 @@ class TRTConvertValidation { } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); @@ -141,7 +148,7 @@ class TRTConvertValidation { PADDLE_ENFORCE(var); auto tensor = var->GetMutable(); - engine_->SetInputFromCPU( + engine_->SetInputFromGPU( input, static_cast(tensor->data()), sizeof(float) * analysis::AccuDims(tensor->dims(), tensor->dims().size())); @@ -151,8 +158,8 @@ class TRTConvertValidation { void Execute(int batch_size) { // Execute Fluid Op PADDLE_ENFORCE_LE(batch_size, max_batch_size_); - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); op_->Run(scope_, place); // Execute TRT. engine_->Execute(batch_size); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index b821c3d0bf425c46fae634fbf53f7ee63100ca5c..14e9e14d33d637ee68e37593cc48721e5169499f 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -33,6 +33,7 @@ void TensorRTEngine::Build(const DescType &paddle_model) { } void TensorRTEngine::Execute(int batch_size) { + freshDeviceId(); batch_size_ = batch_size; std::vector buffers; for (auto &buf : buffers_) { @@ -60,6 +61,7 @@ TensorRTEngine::~TensorRTEngine() { } void TensorRTEngine::FreezeNetwork() { + freshDeviceId(); PADDLE_ENFORCE(infer_builder_ != nullptr, "Call InitNetwork first to initialize network."); PADDLE_ENFORCE(infer_network_ != nullptr, @@ -241,6 +243,13 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } +void TensorRTEngine::freshDeviceId() { + int count; + cudaGetDeviceCount(&count); + PADDLE_ENFORCE_LT(device_, count); + cudaSetDevice(device_); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 694468c419c20089de1cdecff1a903ad0cc6e99f..bd3ba4cea6551a7f6651e311e2649de191a6faa1 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -52,13 +53,15 @@ class TensorRTEngine : public EngineBase { }; TensorRTEngine(int max_batch, int max_workspace, - cudaStream_t* stream = nullptr, + cudaStream_t* stream = nullptr, int device = 0, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), - logger_(logger) { - cudaStreamCreate(&default_stream_); + logger_(logger), + device_(device) { + freshDeviceId(); + cudaStreamCreate(stream_); } virtual ~TensorRTEngine(); @@ -119,6 +122,15 @@ class TensorRTEngine : public EngineBase { nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); + int GetDevice() { return device_; } + + // A pointer to CPU memory is needed of the TRT weight. + // Before TRT runs, fluid loads weight into GPU storage. + // so we need to copy the weights from GPU to CPU in our op converter. + // We use a map to store these weights for the weight memory is not released + // in advance, which affecting the construction of TRT Op. + std::unordered_map> + weight_map; private: // the max batch size @@ -140,6 +152,8 @@ class TensorRTEngine : public EngineBase { std::unordered_map buffer_sizes_; std::unordered_map itensor_map_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; // TensorRT related internal members template @@ -156,6 +170,10 @@ class TensorRTEngine : public EngineBase { infer_ptr infer_network_; infer_ptr infer_engine_; infer_ptr infer_context_; + // Each ICudaEngine object is bound to a specific GPU when it is instantiated, + // ensure that the thread is associated with the correct device by calling + // freshDeviceId(). + void freshDeviceId(); }; // class TensorRTEngine // Add an layer__ into engine__ with args ARGS. @@ -188,8 +206,8 @@ class TRT_EngineManager { // Create or get an engine called `name` TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream); + const std::string& name, int gpu_device = 0) { + auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); engines_[name].reset(p); return p; } diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index dc03702990587bf5e65d28da662d10df4d882110..da1f6535cb3b2476cd475797861d6d2bb6d88856 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,7 +27,7 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ASSERT_EQ(0, cudaStreamCreate(&stream_)); + // ASSERT_EQ(0, cudaStreamCreate(&stream_)); engine_ = new TensorRTEngine(10, 1 << 10, &stream_); engine_->InitNetwork(); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e8b5dec9d49f5613cec92441d19ab7dc1a1ad90c..e29fe2a42bd1aaee1ea8c01159e331cf47ca6b72 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -100,7 +100,8 @@ function(op_library TARGET) endif() # Define operators that don't need pybind here. 
- foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") + foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" +"tensor_array_read_write_op" "tensorrt_engine_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -248,6 +249,7 @@ op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis) diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 4a6ce938a5f337d035b21f562d46daf606236db0..a1f368e8690512cec2db7593aabc0279bbe174eb 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -57,6 +57,8 @@ class RecvOp : public framework::OperatorBase { class RecvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddInput("X", "(Any) Dummy inputs, used for control dependency") + .AsDuplicable(); AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable(); AddComment(R"DOC( Recv operator diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 1866a86048acbefadcb4d82cd6309cd16f0352d6..14b07649c416ff1b671fc9b5ee4eb956b44570c5 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -37,22 +37,19 @@ class SendBarrierOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { std::vector eps = Attr>("endpoints"); - bool sync_mode = Attr("sync_mode"); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); - VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; + VLOG(3) << "SendBarrierOp sync"; // need to wait before sending send_barrier message PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); - if (sync_mode) { - for (auto& ep : eps) { - VLOG(3) << "send barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); - } - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (auto& ep : eps) { + VLOG(3) << "send barrier, ep: " << ep; + rpc_client->AsyncSendBatchBarrier(ep); } + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; @@ -70,7 +67,6 @@ the Parameter Server would knew all variables have been sent. 
"(string vector, default 127.0.0.1:6164)" "Server endpoints to send variables to.") .SetDefault({"127.0.0.1:6164"}); - AddAttr("sync_mode", "work in sync_mode or not").SetDefault(true); } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 3cd42f2d059532b7090e66ce21de8e5cb014adf1..82a70e4bf13247d784371ffdf419c9f792d7f721 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { void Make() { AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( Send operator diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index ee3078876c15b06a887064f08dc0c05d450b5f77..4d930e9cec208dca2c0e8f0ff690a1b4730a156a 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -17,10 +17,6 @@ #include #include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/engine.h" -#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { @@ -29,100 +25,6 @@ DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); namespace operators { -using inference::Singleton; -using inference::tensorrt::TRT_EngineManager; - -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - return TRT_DT::kINT32; - default: - return TRT_DT::kINT32; - } - PADDLE_THROW("unkown type"); - return TRT_DT::kINT32; -} - -nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, - "TensorRT' tensor input requires at least 2 dimensions"); - PADDLE_ENFORCE_LE(shape.size(), 4UL, - "TensorRT' tensor input requires at most 4 dimensions"); - PADDLE_ENFORCE_EQ(shape.size(), 4UL); - return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); -} - -} // namespace - -template -void TensorRTEngineKernel::Prepare( - const framework::ExecutionContext &context) const { - VLOG(4) << "Prepare engine"; - // Get the ProgramDesc and pass to convert. - framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch = context.Attr("max_batch"); - auto max_workspace = context.Attr("max_workspace"); - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto ¶m : params) { - parameters.insert(param); - } - - std::vector output_maps = - context.Attr>("output_name_mapping"); - - // TODO(Superjomn) replace this with a different stream - auto *engine = Singleton::Global().Create( - max_batch, max_workspace, nullptr /*engine hold its own stream*/, - context.Attr("engine_uniq_key")); - engine->InitNetwork(); - - framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - VLOG(4) << "parsed var size " << block.AllVars().size(); - // Add inputs - VLOG(4) << "declare inputs"; - for (auto &input : context.Inputs("Xs")) { - if (parameters.count(input)) continue; - VLOG(4) << "declare input " << input; - auto *var = block.FindVar(input); - // TensorRT engine need to create parameters. 
The parameter's description - // should be set in - PADDLE_ENFORCE(var, "no variable called %s", input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); - auto shape = var->GetShape(); - // For the special batch_size placeholder -1, drop it and pass the real - // shape of data. - // TODO(Superjomn) fix this with batch broadcast, or it can't handle - // variational batch size. - if (shape[0] == -1) { - shape[0] = FLAGS_tensorrt_engine_batch_size; - } - engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(shape)); - } - - inference::Singleton::Global().ConvertBlock( - block_desc, parameters, context.scope(), engine); - - // Add outputs - for (auto &output : output_maps) { - engine->DeclareOutput(output); - } - - engine->FreezeNetwork(); -} - class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -150,11 +52,4 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); -REGISTER_OP_CPU_KERNEL( - tensorrt_engine, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel); - #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt_engine_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1ddfde6d51ef719ca0b89cf286b176195ee682a --- /dev/null +++ b/paddle/fluid/operators/tensorrt_engine_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/tensorrt_engine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + tensorrt_engine, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel); diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 2cbe1213a2f428a3ce56b06f97636baeb4b66c26..f2ec7f066aa514d1107af17f9155f2c630b0f3b3 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -19,8 +19,10 @@ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { @@ -29,6 +31,35 @@ DECLARE_int32(tensorrt_engine_batch_size); namespace operators { +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + PADDLE_ENFORCE_EQ(shape.size(), 4UL); + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); +} + +} // namespace + using inference::Singleton; using inference::tensorrt::TRT_EngineManager; @@ -47,7 +78,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel { .FindVar(input0) ->GetMutable() ->type()), - platform::CPUPlace()); + ctx.GetPlace()); return kt; } }; @@ -94,7 +125,9 @@ class TensorRTEngineKernel : public framework::OpKernel { // Convert output tensor from engine to fluid int output_index = 0; + VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto& y : context.Outputs("Ys")) { + VLOG(4) << y; // convert output and copy to fluid. nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); @@ -113,9 +146,11 @@ class TensorRTEngineKernel : public framework::OpKernel { // TODO(Superjomn) change this float to dtype size. auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * FLAGS_tensorrt_engine_batch_size; - engine->GetOutputInCPU(output_maps[output_index], - fluid_t->mutable_data(platform::CPUPlace()), - size * sizeof(float)); + engine->GetOutputInGPU( + output_maps[output_index], + fluid_t->mutable_data(platform::CUDAPlace( + boost::get(context.GetPlace()).device)), + size * sizeof(float)); //} else { // engine->GetOutputInGPU( // y, fluid_t->mutable_data(platform::CUDAPlace()), @@ -128,8 +163,67 @@ class TensorRTEngineKernel : public framework::OpKernel { } protected: - // Build the engine. - void Prepare(const framework::ExecutionContext& context) const; + void Prepare(const framework::ExecutionContext& context) const { + VLOG(4) << "Prepare engine"; + // Get the ProgramDesc and pass to convert. 
+ framework::proto::BlockDesc block_desc; + block_desc.ParseFromString(context.Attr("subgraph")); + int max_batch = context.Attr("max_batch"); + auto max_workspace = context.Attr("max_workspace"); + auto params = context.Attr>("parameters"); + std::unordered_set parameters; + for (const auto& param : params) { + parameters.insert(param); + } + + std::vector output_maps = + context.Attr>("output_name_mapping"); + + // TODO(Superjomn) replace this with a different stream + auto* engine = Singleton::Global().Create( + max_batch, max_workspace, nullptr /*engine hold its own stream*/, + context.Attr("engine_uniq_key"), + boost::get(context.GetPlace()).device); + + engine->InitNetwork(); + + framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); + VLOG(4) << "parsed var size " << block.AllVars().size(); + // Add inputs + VLOG(4) << "declare inputs"; + for (auto& input : context.Inputs("Xs")) { + if (parameters.count(input)) continue; + VLOG(4) << "declare input " << input; + auto* var = block.FindVar(input); + // TensorRT engine need to create parameters. The parameter's description + // should be set in + PADDLE_ENFORCE(var, "no variable called %s", input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto shape = var->GetShape(); + // For the special batch_size placeholder -1, drop it and pass the real + // shape of data. + // TODO(Superjomn) fix this with batch broadcast, or it can't handle + // variational batch size. + if (shape[0] == -1) { + shape[0] = FLAGS_tensorrt_engine_batch_size; + } + engine->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(shape)); + } + + inference::Singleton::Global() + .ConvertBlock(block_desc, parameters, context.scope(), engine); + + // Add outputs + for (auto& output : output_maps) { + engine->DeclareOutput(output); + } + + engine->FreezeNetwork(); + } }; } // namespace operators diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 37657fa0b0498986fe67027415279af1775e58b9..97c375361f42a61e8a2a626bcec2a91263ee2014 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -23,20 +23,20 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_CPU_ONLY_OP(tensorrt_engine); +USE_CUDA_ONLY_OP(tensorrt_engine); namespace paddle { namespace operators { namespace { -void CreateCPUTensor(framework::Scope* scope, const std::string& name, - const std::vector& shape) { +void CreateCUDATensor(framework::Scope* scope, const std::string& name, + const std::vector& shape) { auto* var = scope->Var(name); auto* tensor = var->GetMutable(); auto dims = framework::make_ddim(shape); tensor->Resize(dims); - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -112,15 +112,15 @@ TEST(TensorRTEngineOp, manual) { LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); // Prepare variables. 
- CreateCPUTensor(&scope, "x", std::vector({2, 4})); - CreateCPUTensor(&scope, "y", std::vector({4, 6})); - CreateCPUTensor(&scope, "z", std::vector({2, 6})); + CreateCUDATensor(&scope, "x", std::vector({2, 4})); + CreateCUDATensor(&scope, "y", std::vector({4, 6})); + CreateCUDATensor(&scope, "z", std::vector({2, 6})); - CreateCPUTensor(&scope, "y0", std::vector({6, 8})); - CreateCPUTensor(&scope, "z0", std::vector({2, 8})); + CreateCUDATensor(&scope, "y0", std::vector({6, 8})); + CreateCUDATensor(&scope, "z0", std::vector({2, 8})); // Execute them. LOG(INFO) << "engine_op run"; @@ -130,8 +130,8 @@ TEST(TensorRTEngineOp, manual) { void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::ProgramDesc program; framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -165,10 +165,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { // Prepare variables. if (!x_created) { - CreateCPUTensor(&scope, x_name, std::vector(x_shape)); + CreateCUDATensor(&scope, x_name, std::vector(x_shape)); } - CreateCPUTensor(&scope, y_name, std::vector(y_shape)); - CreateCPUTensor(&scope, z_name, std::vector(z_shape)); + CreateCUDATensor(&scope, y_name, std::vector(y_shape)); + CreateCUDATensor(&scope, z_name, std::vector(z_shape)); // It is wired, need to copy manually. *block_->add_ops() = *fc->Proto(); diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 23457ff5fe1ec27094113ba0dde26adc64c716b5..9f504d14a8da116648483c0f64cb511b46e6a97e 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, #if CUDA_VERSION < 9000 return __shfl_down(val, delta, width); #else - return __shfl_down_sync(mask, val, delta, width); + return __shfl_down_sync(mask, val, static_cast(delta), width); #endif } @@ -46,9 +46,16 @@ template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { - half tmp = static_cast(val); - __shfl_down(tmp, static_cast(delta), width); - return float16(tmp); + return float16( + __shfl_down(static_cast(val), static_cast(delta), width)); +} +#else +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + return float16(__shfl_down_sync(mask, static_cast(val), + static_cast(delta), width)); } #endif diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index ca5ca1caeb23f01c047feeccf9c276b2dcd1cb68..ee45afab93d079374aefe366425502890854c28d 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) { cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); for (size_t i = 0; i < num / 2; ++i) { - // NOTE(dzhwinter): the float16 add has small underflow/overflow + // NOTE(dzhwinter): the float16 add has small truncate error. // so we use EXPECT_NEAR to check the result. 
EXPECT_NEAR(static_cast(out[i]), static_cast(AddFunctor()(r_in1[i], r_in2[i])), @@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) { TestUnalign(static_cast(1024), /*shift_bit*/ 3); TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 3); } + +// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ +template +static __forceinline__ __device__ T WarpReduceSum(T val) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += paddle::platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +template +__forceinline__ __device__ T BlockReduce(T val) { + static __shared__ T shared[32]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = WarpReduceSum(val); // Each warp performs partial reduction + + if (lane == 0) shared[wid] = val; // Write reduced value to shared memory + + __syncthreads(); // Wait for all partial reductions + + // read from shared memory only if that warp existed + val = + (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast(0); + + if (wid == 0) val = WarpReduceSum(val); // Final reduce within first warp + + return val; +} + +template +__global__ void DeviceReduceSum(T* in, T* out, size_t N) { + T sum(0); + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + sum += in[i]; + } + sum = BlockReduce(sum); + __syncthreads(); + if (threadIdx.x == 0) out[blockIdx.x] = sum; +} + +template +void TestReduce(size_t num, float atol = 0.01) { + T* in1; + T *d_in1, *d_in2; + size_t size = sizeof(T) * num; + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), sizeof(T)); + in1 = reinterpret_cast(malloc(size)); + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num; ++i) { + in1[i] = static_cast(dist(engine)); + } + auto out = std::accumulate(in1, in1 + num, static_cast(0)); + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + DeviceReduceSum<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); + cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(in1[0]), static_cast(out), atol); + free(in1); + cudaFree(d_in1); + cudaFree(d_in2); +} + +TEST(CudaShuffleSync, float16) { + TestReduce(10); + TestReduce(1000); + + // float16 will overflow or accumulate truncate errors in big size. 
+ TestReduce(10); + TestReduce(100, /*atol error*/ 1.0); +} diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 1283de9d957a46b848c7bb6caf9c5f49398468e2..622a2d51049d164b6e8423e4054081f40f190cb9 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -54,7 +54,7 @@ function cpu_config() { if [ $platform == "Linux" ]; then ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` elif [ $platform == "Darwin" ]; then - if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then + if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then # HT is OFF ht=1 fi diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 3038747bf8d03cf82613f1b329b7eeb778a0b9f2..38addd0cfd9bd0afde7eefc57f2111b717b7e636 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -24,7 +24,6 @@ import paddle.dataset.common import subprocess import numpy import platform -import six import tempfile from six.moves import range __all__ = ['train', 'test', 'convert'] diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 21a295a0982cbc51947a063beee542c13494024d..b03ee514f50f9a8c1425bd5b1d409b58ed62351a 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -24,7 +24,7 @@ from .layer_function_generator import templatedoc from .. import core from ..executor import global_scope from ..framework import convert_np_dtype_to_dtype_, default_main_program, \ - default_startup_program, program_guard, Program + default_startup_program, program_guard, Program, Variable from ..layer_helper import LayerHelper from ..unique_name import generate as unique_name @@ -209,7 +209,7 @@ class ListenAndServ(object): }) -def Send(endpoints, send_vars, sync=True): +def Send(endpoints, send_vars, dummy_output=None, sync=True): """ Send variables to the server side, and get vars from server side when server have finished running server side program. 
@@ -223,6 +223,13 @@ def Send(endpoints, send_vars, sync=True): """ assert (type(send_vars) == list) + if dummy_output is None: + dummy_output = [] + elif isinstance(dummy_output, Variable): + dummy_output = [dummy_output] + + assert (type(dummy_output) == list) + epmap = endpoints.split(",") endpoints = list(set(epmap)) @@ -232,6 +239,7 @@ def Send(endpoints, send_vars, sync=True): helper.append_op( type="send", inputs={"X": send_vars}, + outputs={"Out": dummy_output}, attrs={ "endpoints": endpoints, "epmap": epmap, @@ -241,7 +249,7 @@ def Send(endpoints, send_vars, sync=True): helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) -def Recv(endpoints, get_vars, sync=True): +def Recv(endpoints, get_vars, dummy_input=None, sync=True): """ Receive variables from server side @@ -256,13 +264,20 @@ def Recv(endpoints, get_vars, sync=True): """ assert (type(get_vars) == list) + if dummy_input is None: + dummy_input = [] + elif isinstance(dummy_input, Variable): + dummy_input = [dummy_input] + + assert (type(dummy_input) == list) + epmap = endpoints.split(",") endpoints = list(set(epmap)) helper = LayerHelper("Recv", **locals()) helper.append_op( type="recv", - inputs={"X": get_vars}, + inputs={"X": dummy_input}, outputs={"Out": get_vars}, attrs={"endpoints": endpoints, "epmap": epmap}) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index b0ee6ff9f5941b090e663dac0122cfb575f2f442..1307ba4e4ad11ef01094c44068d916ff2d442f78 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -16,7 +16,6 @@ from __future__ import print_function import numpy as np import argparse -import six import time import math diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 49d1b6c7b4e8fe353a4e849a407bc988183c96eb..57bc2e8a0ba173bb1273a5183340d0b618f0d73c 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -34,6 +34,7 @@ import math import random import numpy as np import collections +import six from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework @@ -210,6 +211,9 @@ class DistributeTranspiler(object): ps_dispatcher = self.config.split_method(self.pserver_endpoints) self.has_distributed_lookup_table = self._has_distributed_lookup_table() + self.param_name_to_grad_name = dict() + for param_var, grad_var in self.params_grads: + self.param_name_to_grad_name[param_var.name] = grad_var.name # step 1: split and create vars, then put splited vars in dicts for later use. 
self._init_splited_vars() @@ -229,34 +233,39 @@ class DistributeTranspiler(object): random.seed(self.origin_program.random_seed) random.shuffle(grad_var_mapping_items) - for orig_varname, splited_vars in grad_var_mapping_items: + grad_name_to_send_dummy_out = dict() + for grad_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) if not self.config.slice_var_up: assert (len(splited_vars) == 1) + splited_grad_varname = grad_varname if len(splited_vars) == 1: - orig_varname = splited_vars[0].name + splited_grad_varname = splited_vars[0].name index = find_op_by_output_arg(program.global_block(), - orig_varname) + splited_grad_varname) elif len(splited_vars) > 1: - orig_var = program.global_block().vars[orig_varname] + orig_var = program.global_block().vars[splited_grad_varname] index = find_op_by_output_arg(program.global_block(), - orig_varname) + splited_grad_varname) self._insert_split_op(program, orig_var, index, splited_vars) index += 1 else: AssertionError("Can not insert the send op by original " - "variable name :", orig_varname) + "variable name :", splited_grad_varname) + dummy_output = program.global_block().create_var() + grad_name_to_send_dummy_out[grad_varname] = dummy_output program.global_block()._insert_op( index=index + 1, type="send", inputs={"X": splited_vars}, - outputs={}, + outputs={"Out": dummy_output}, attrs={ "epmap": eplist, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + "sync_mode": not self.sync_mode, }) for _, var in enumerate(splited_vars): send_vars.append(var) @@ -268,7 +277,6 @@ class DistributeTranspiler(object): outputs={}, attrs={ "endpoints": pserver_endpoints, - "sync_mode": self.sync_mode, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) @@ -284,19 +292,21 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) # step4: Concat the parameters splits together after recv. 
- for varname, splited_var in six.iteritems(self.param_var_mapping): + for param_varname, splited_var in six.iteritems(self.param_var_mapping): eps = [] for var in splited_var: index = [v.name for v in recv_vars].index(var.name) eps.append(eplist[index]) - + grad_send_dummy_out = grad_name_to_send_dummy_out[ + self.param_name_to_grad_name[param_varname]] program.global_block().append_op( type="recv", - inputs={}, + inputs={"X": [grad_send_dummy_out]}, outputs={"Out": splited_var}, attrs={ "epmap": eps, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + "sync_mode": not self.sync_mode }) if self.sync_mode: @@ -309,10 +319,10 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) - for varname, splited_var in six.iteritems(self.param_var_mapping): + for param_varname, splited_var in six.iteritems(self.param_var_mapping): if len(splited_var) <= 1: continue - orig_param = program.global_block().vars[varname] + orig_param = program.global_block().vars[param_varname] program.global_block().append_op( type="concat", inputs={"X": splited_var}, @@ -380,7 +390,7 @@ class DistributeTranspiler(object): op = startup_program.global_block().append_op( type="recv", - inputs={}, + inputs={"X": []}, outputs={"Out": splited_var}, attrs={ "epmap": eps, @@ -786,19 +796,21 @@ class DistributeTranspiler(object): self.config.min_block_size) assert (len(grad_blocks) == len(param_blocks)) - # origin_varname -> [splited_var] + # origin_param_name -> [splited_param_vars] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) + # origin_grad_name -> [splited_grad_vars] self.grad_var_mapping = self._create_vars_from_blocklist( self.origin_program, grad_blocks, add_trainer_suffix=self.trainer_num > 1) + # dict(grad_splited_var -> param_splited_var) self.grad_param_mapping = collections.OrderedDict() for g, p in zip(grad_blocks, param_blocks): g_name, g_bid, _ = g.split(":") p_name, p_bid, _ = p.split(":") self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ - self.param_var_mapping[p_name][int(p_bid)] + self.param_var_mapping[p_name][int(p_bid)] # create mapping of endpoint -> split var to create pserver side program self.param_grad_ep_mapping = collections.OrderedDict() @@ -919,7 +931,7 @@ class DistributeTranspiler(object): index=op_index + 2, type="send", inputs={'X': self.trainer_side_table_grad_list}, - outputs={}, + outputs={'Out': []}, attrs={ "sync_mode": True, "epmap": pserver_endpoints, diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index 987fc01cfc13bcefbb9d792a6780fdfff158755d..0d59e4c110ff8502acb4dbcda15f855f7652a946 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig -RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz freetype-devel libpng-devel graphviz +RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && \ bash build_scripts/install_nccl2.sh && rm -r build_scripts diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh index 
d99d3db2ed7e3a719f044d8117baeb7ac212b74d..eb4b477dcb538f7ba17cfc54057a97c9669a6916 100644 --- a/tools/manylinux1/build_scripts/build.sh +++ b/tools/manylinux1/build_scripts/build.sh @@ -28,7 +28,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel" # Libraries that are allowed as part of the manylinux1 profile -MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" +MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel" # Get build utilities MY_DIR=$(dirname "${BASH_SOURCE[0]}")