Commit 25adf970 authored by P peizhilin

Merge remote-tracking branch 'upstream/develop' into windows/build

...@@ -311,6 +311,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
endif()
add_subdirectory(paddle)
if(WITH_PYTHON)
  add_subdirectory(python)
...@@ -321,10 +329,3 @@ if(WITH_DOC)
  find_python_module(recommonmark REQUIRED)
  add_subdirectory(doc)
endif()
...@@ -15,8 +15,15 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm>
#include <deque>
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <unordered_set>
DEFINE_string(print_sub_graph_dir, "",
"FLAGS_print_sub_graph_dir is used "
"to print the nodes of sub_graphs.");
namespace paddle {
namespace framework {
namespace ir {
...@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
    graph_nodes.emplace_back(g_nodes);
  }
  if (FLAGS_print_sub_graph_dir.size()) {
    if (graph_nodes.size() > 1) {
      std::stringstream out;
      for (auto &g_n : graph_nodes) {
        out << "graph_nodes: " << g_n.size() << "\n";
      }
      out << "\n\n";
      for (auto &g_n : graph_nodes) {
        out << "graph_nodes: " << g_n.size();
        for (auto &node : g_n) {
          out << "\nNode: " << node->Name() << " in [";
          for (auto &n : node->inputs) {
...@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
          }
          out << "]";
        }
        out << "\n\n\n";
      }
      std::unique_ptr<std::ostream> fout(
          new std::ofstream(FLAGS_print_sub_graph_dir));
      PADDLE_ENFORCE(fout->good());
      *fout << out.str();
    }
  }
......
...@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor(
  }
  // If the loss_var_name is given, the number of graph should be only one.
  if (loss_var_name.size()) {
    size_t graph_num = ir::GraphNum(*graph);
    if (graph_num > 1) {
      LOG(WARNING)
          << "The number of graph should be only one, "
             "but the current graph has "
          << ir::GraphNum(*graph)
          << " sub_graphs. If you want to see the nodes of the "
             "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
             "to specify the output dir. NOTES: if you not do training, "
             "please don't pass loss_var_name.";
    }
  }
  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
......
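Editorial note (not part of the diff): FLAGS_print_sub_graph_dir, referenced by the warning above and defined via DEFINE_string in graph_helper.cc, is an ordinary gflags string flag. A minimal, hypothetical usage sketch follows; the binary name, output path, and exact gflags namespace are assumptions, not part of this commit.

#include <gflags/gflags.h>
// Refers to the DEFINE_string(print_sub_graph_dir, ...) added in graph_helper.cc above.
DECLARE_string(print_sub_graph_dir);

int main(int argc, char* argv[]) {
  // Either pass it on the command line, e.g.
  //   ./my_trainer --print_sub_graph_dir=/tmp/sub_graphs.txt
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  // ...or set it programmatically before the ParallelExecutor is constructed.
  FLAGS_print_sub_graph_dir = "/tmp/sub_graphs.txt";
  // ... build and run the ParallelExecutor as usual ...
  return 0;
}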
...@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // it is either an OP's input or an OP's output.
  auto &subgraph_nodes = *Agent(node).subgraph();
  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
    auto correspond_node = subgraph_nodes[index];
    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
......
...@@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
  std::unordered_set<std::string> teller_set(
      {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
       "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
       "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
       "conv2d_transpose"});
  if (!node->IsOp()) return false;
  if (teller_set.count(node->Op()->Type())) {
......
# Add TRT tests
nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
  batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
  pad_op.cc split_op.cc prelu_op.cc
  DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
...@@ -20,7 +20,8 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
  elementwise_add_op elementwise_mul_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
...@@ -33,7 +34,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL)
nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
  split_op concat_op SERIAL)
nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
  prelu_op SERIAL)
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
...@@ -13,11 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
static bool CheckDims(const nvinfer1::Dims& dims_x,
const nvinfer1::Dims& dims_y) {
if (dims_x.nbDims != dims_y.nbDims) {
return false;
}
for (int i = 0; i < dims_x.nbDims; i++) {
if (dims_x.d[i] != dims_y.d[i]) {
return false;
}
}
return true;
}
class ElementwiseWeightOpConverter : public OpConverter {
 public:
  ElementwiseWeightOpConverter() {}
...@@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
...@@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter {
  ElementwiseTensorOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    auto op_pair = ops.find(op_type_);
    PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!");
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
...@@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter {
    nvinfer1::Dims dims_x = X->getDimensions();
    nvinfer1::Dims dims_y = Y->getDimensions();
    int axis = boost::get<int>(op_desc.GetAttr("axis"));
    auto output_name = op_desc.Output("Out")[0];
    if (CheckDims(dims_x, dims_y)) {
      // The two input tensor should have the same dims
      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
      nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
          engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
          *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
      layer->getOutput(0)->setName(output_name.c_str());
      engine_->SetITensor(output_name, layer->getOutput(0));
    } else {
      VLOG(3) << "Convert a fluid elementwise op to TensorRT "
                 "ElementWisePluginLayer";
      plugin::ElementWisePlugin* plugin =
          new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
      plugin->AddInput(X);
      plugin->AddInput(Y);
      nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
          const_cast<nvinfer1::ITensor* const*>(plugin->GetInputs().data()), 2,
          reinterpret_cast<plugin::PluginTensorRT*>(plugin));
      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
      layer->getOutput(0)->setName(output_name.c_str());
      engine_->SetITensor(output_name, layer->getOutput(0));
    }
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.
      engine_->DeclareOutput(output_name);
......
...@@ -61,7 +61,7 @@ class OpConverter {
    // TODO(xingzhaolong): all mul, sub, div
    // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
    // "sub", "div"};
    static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
    int op_type_len = op_desc.Type().size();
    std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
......
...@@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter {
    TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
                                    static_cast<void*>(alpha_data),
                                    alpha_tensor_device->numel());
    plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);
    // keep alpha tensor to avoid release it's memory
......
...@@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter {
    PADDLE_ENFORCE(output_lengths.size() == output_num);
    //
    plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);
......
...@@ -20,13 +20,12 @@ namespace paddle {
namespace inference {
namespace tensorrt {
TEST(elementwise_op, add_weight) {
  std::unordered_set<std::string> parameters({"elementwise_add-Y"});
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1 << 15);
  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
  validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
  // Prepare Op description
...@@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) {
  validator.Execute(8);
}
TEST(elementwise_op, native) {
  for (std::string type : {"add", "mul"}) {
    int batch_size = 8;
    std::unordered_set<std::string> parameters;
    framework::Scope scope;
    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
    validator.DeclInputVar("elementwise_" + type + "-X",
                           nvinfer1::DimsCHW(10, 3, 3));
    validator.DeclInputVar("elementwise_" + type + "-Y",
                           nvinfer1::Dims3(10, 3, 3));
    validator.DeclOutputVar("elementwise_" + type + "-Out",
                            nvinfer1::DimsCHW(10, 3, 3));
    // Prepare Op description
    framework::OpDesc desc;
    desc.SetType("elementwise_" + type);
    desc.SetInput("X", {"elementwise_" + type + "-X"});
    desc.SetInput("Y", {"elementwise_" + type + "-Y"});
    desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
    int axis = -1;
    desc.SetAttr("axis", axis);
    validator.SetOp(*desc.Proto());
    validator.Execute(batch_size);
  }
}
TEST(elementwise_op, plugin) {
  for (std::string type : {"add", "mul"}) {
    int batch_size = 8;
    std::unordered_set<std::string> parameters;
    framework::Scope scope;
    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
    validator.DeclInputVar("elementwise_" + type + "-X",
                           nvinfer1::DimsCHW(10, 3, 3));
    validator.DeclInputVar("elementwise_" + type + "-Y",
                           nvinfer1::Dims3(10, 1, 1));
    validator.DeclOutputVar("elementwise_" + type + "-Out",
                            nvinfer1::DimsCHW(10, 3, 3));
    // Prepare Op description
    framework::OpDesc desc;
    desc.SetType("elementwise_" + type);
    desc.SetInput("X", {"elementwise_" + type + "-X"});
    desc.SetInput("Y", {"elementwise_" + type + "-Y"});
    desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
    int axis = -1;
    desc.SetAttr("axis", axis);
    validator.SetOp(*desc.Proto());
    validator.Execute(batch_size);
  }
}
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
USE_OP(elementwise_add);
USE_OP(elementwise_mul);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
......
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......
...@@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() {
}
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
    nvinfer1::ITensor *const *inputs, int num_inputs,
    plugin::PluginTensorRT *plugin) {
  owned_plugin_.emplace_back(plugin);
  return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
}
}  // namespace tensorrt
......
...@@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase {
  int GetRuntimeBatch();
  int GetDevice() { return device_; }
  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                    int num_inputs, plugin::PluginTensorRT*);
  // A pointer to CPU memory is needed of the TRT weight.
  // Before TRT runs, fluid loads weight into GPU storage.
...@@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase {
  // The specific GPU id that the TensorRTEngine bounded to.
  int device_;
  std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;
  // TensorRT related internal members
  template <typename T>
......
nv_library(tensorrt_plugin
  SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
  DEPS enforce device_context)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
namespace details {
template <typename T>
struct Add {
__device__ T operator()(const T& a, const T& b) const { return a + b; }
};
template <typename T>
struct Mul {
__device__ T operator()(const T& a, const T& b) const { return a * b; }
};
template <typename T, typename Operator>
__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out,
int batch_size, int num_rows, int num_cols) {
for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
int row = blockIdx.x;
for (; row < num_rows; row += gridDim.x) {
T value_y = y[batch_id * num_rows + row];
int col = threadIdx.x;
int offset = (batch_id * num_rows + row) * num_cols;
for (; col < num_cols; col += blockDim.x) {
T value_x = x[offset + col];
out[offset + col] = op(value_x, value_y);
}
}
}
}
template <typename T, typename Operator>
static void ElementWise(Operator op, const T* x, const T* y, T* out,
int batch_size, int prev, int midd, int post,
cudaStream_t stream) {
const int kThreadsPerBlock = 1024;
const int kMaximumBlocks = 65535;
if (prev == 1) {
int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock
: (((post + 31) >> 5) << 5);
int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks;
ColumnWiseKernel<<<num_blocks, num_threads, 0, stream>>>(
op, x, y, out, batch_size, midd, post);
} else if (post == 1) {
PADDLE_THROW("Not implemented.");
} else {
PADDLE_THROW("Not implemented.");
}
}
} // namespace details
nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims* input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0);
PADDLE_ENFORCE_EQ(num_inputs, 2);
PADDLE_ENFORCE_NOT_NULL(input_dims);
return input_dims[0];
}
int ElementWisePlugin::initialize() {
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
int trimed_nb_dims = dims_y_.nbDims;
for (; trimed_nb_dims > 0; --trimed_nb_dims) {
if (dims_y_.d[trimed_nb_dims - 1] != 1) {
break;
}
}
dims_y_.nbDims = trimed_nb_dims;
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
prev_size_ = 1;
midd_size_ = 1;
post_size_ = 1;
for (int i = 0; i < axis_; ++i) {
prev_size_ *= dims_x_.d[i];
}
for (int i = 0; i < dims_y_.nbDims; ++i) {
PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
"Broadcast dimension mismatch.");
midd_size_ *= dims_y_.d[i];
}
for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) {
post_size_ *= dims_x_.d[i];
}
return 0;
}
int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
void** outputs, void* workspace,
cudaStream_t stream) {
const float* x = reinterpret_cast<const float*>(inputs[0]);
const float* y = reinterpret_cast<const float*>(inputs[1]);
float* out = reinterpret_cast<float*>(outputs[0]);
if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
details::ElementWise(details::Add<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else {
PADDLE_THROW("Not implemented.");
}
return cudaGetLastError() != cudaSuccess;
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
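Editorial note (not part of the diff): ElementWisePlugin::initialize() above reduces broadcasting to three sizes, prev_size_ (product of X's dims before axis), midd_size_ (product of the dims X shares with the trimmed Y), and post_size_ (product of X's remaining trailing dims). The standalone CPU sketch below reproduces that arithmetic for the shapes used by the converter test in this commit (X = (10, 3, 3), Y = (10, 1, 1), axis = -1); it is an illustration only.

#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> dims_x = {10, 3, 3};  // per-sample shape of X
  std::vector<int> dims_y = {10, 1, 1};  // per-sample shape of Y
  int axis = -1;

  // axis == -1 aligns Y with the leading dims of X, as in the plugin.
  if (axis == -1) axis = static_cast<int>(dims_x.size() - dims_y.size());
  // Trim trailing 1s of Y, mirroring initialize().
  while (!dims_y.empty() && dims_y.back() == 1) dims_y.pop_back();

  int prev = 1, midd = 1, post = 1;
  for (int i = 0; i < axis; ++i) prev *= dims_x[i];
  for (size_t i = 0; i < dims_y.size(); ++i) {
    assert(dims_x[axis + i] == dims_y[i]);  // broadcast dims must match
    midd *= dims_y[i];
  }
  for (size_t i = axis + dims_y.size(); i < dims_x.size(); ++i) post *= dims_x[i];

  // Prints prev=1 midd=10 post=9; prev == 1 is the ColumnWiseKernel case.
  std::printf("prev=%d midd=%d post=%d\n", prev, midd, post);
  return 0;
}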
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class ElementWisePlugin : public PluginTensorRT {
public:
ElementWisePlugin(nvinfer1::ElementWiseOperation type,
nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y,
int axis)
: type_(type),
dims_x_(dims_x),
dims_y_(dims_y),
axis_(axis),
prev_size_(1),
midd_size_(1),
post_size_(1) {}
ElementWisePlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_);
}
ElementWisePlugin *clone() const override {
// return new ElementWisePlugin(dims_x_, dims_y_, axis_);
return nullptr;
}
const char *getPluginType() const override { return "elementwise"; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims,
int num_inputs) override;
int initialize() override;
// execute the layer
int enqueue(int batch_size, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream);
protected:
size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(dims_x_) +
SerializedSize(dims_y_) + getBaseSerializationSize();
}
void serialize(void *buffer) override {
serializeBase(buffer);
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_);
}
nvinfer1::ElementWiseOperation type_;
nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_;
int axis_;
int prev_size_;
int midd_size_;
int post_size_;
};
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -20,6 +20,7 @@
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
...@@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
  return cudaGetLastError() != cudaSuccess;
}
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -21,6 +21,7 @@
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PReluPlugin : public PluginTensorRT {
  TensorRTEngine::Weight alpha_;
...@@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT {
                void *workspace, cudaStream_t stream) override;
};
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -14,10 +14,15 @@
#pragma once
#include <cstring>
#include <type_traits>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
template <typename T>
inline void SerializeValue(void** buffer, T const& value);
...@@ -26,7 +31,7 @@ template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size,
                             T* value);
namespace details {
template <typename T, class Enable = void>
struct Serializer {};
...@@ -36,10 +41,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
                                             std::is_enum<T>::value ||
                                             std::is_pod<T>::value>::type> {
  static size_t SerializedSize(T const& value) { return sizeof(T); }
  static void Serialize(void** buffer, T const& value) {
    std::memcpy(*buffer, &value, sizeof(T));
    reinterpret_cast<char*&>(*buffer) += sizeof(T);
  }
  static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
    assert(*buffer_size >= sizeof(T));
    std::memcpy(value, *buffer, sizeof(T));
...@@ -51,10 +58,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
template <>
struct Serializer<const char*> {
  static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
  static void Serialize(void** buffer, const char* value) {
    std::strcpy(static_cast<char*>(*buffer), value);  // NOLINT
    reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
  }
  static void Deserialize(void const** buffer, size_t* buffer_size,
                          const char** value) {
    *value = static_cast<char const*>(*buffer);
...@@ -73,39 +82,46 @@ struct Serializer<std::vector<T>,
  static size_t SerializedSize(std::vector<T> const& value) {
    return sizeof(value.size()) + value.size() * sizeof(T);
  }
  static void Serialize(void** buffer, std::vector<T> const& value) {
    SerializeValue(buffer, value.size());
    size_t nbyte = value.size() * sizeof(T);
    std::memcpy(*buffer, value.data(), nbyte);
    reinterpret_cast<char*&>(*buffer) += nbyte;
  }
  static void Deserialize(void const** buffer, size_t* buffer_size,
                          std::vector<T>* value) {
    size_t size;
    DeserializeValue(buffer, buffer_size, &size);
    value->resize(size);
    size_t nbyte = value->size() * sizeof(T);
    PADDLE_ENFORCE_GE(*buffer_size, nbyte);
    std::memcpy(value->data(), *buffer, nbyte);
    reinterpret_cast<char const*&>(*buffer) += nbyte;
    *buffer_size -= nbyte;
  }
};
}  // namespace details
template <typename T>
inline size_t SerializedSize(T const& value) {
  return details::Serializer<T>::SerializedSize(value);
}
template <typename T>
inline void SerializeValue(void** buffer, T const& value) {
  return details::Serializer<T>::Serialize(buffer, value);
}
template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size,
                             T* value) {
  return details::Serializer<T>::Deserialize(buffer, buffer_size, value);
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
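Editorial note (not part of the diff): the Serializer helpers above are what the TensorRT plugins in this commit use inside serialize()/deserializeBase() to pack their configuration into a flat buffer. A minimal round-trip sketch follows, assuming this header and its paddle dependencies are on the include path; the field values are illustrative.

#include <cstddef>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"

int main() {
  namespace plugin = paddle::inference::tensorrt::plugin;
  int axis = -1;
  std::vector<int> lengths = {2, 3, 5};

  // Size the buffer up front, then serialize field by field; every call
  // advances the write cursor.
  std::vector<char> buf(plugin::SerializedSize(axis) +
                        plugin::SerializedSize(lengths));
  void* wptr = buf.data();
  plugin::SerializeValue(&wptr, axis);
  plugin::SerializeValue(&wptr, lengths);

  // Deserialize in the same order; every call also shrinks the remaining size.
  const void* rptr = buf.data();
  size_t remaining = buf.size();
  int axis2 = 0;
  std::vector<int> lengths2;
  plugin::DeserializeValue(&rptr, &remaining, &axis2);
  plugin::DeserializeValue(&rptr, &remaining, &lengths2);
  return (axis2 == axis && lengths2 == lengths) ? 0 : 1;
}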
...@@ -12,26 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
nvinfer1::Dims SplitPlugin::getOutputDimensions(
    int index, const nvinfer1::Dims* input_dims, int num_inputs) {
  PADDLE_ENFORCE_EQ(num_inputs, 1);
  PADDLE_ENFORCE_LT(index, this->getNbOutputs());
  nvinfer1::Dims output_dims = input_dims[0];
  output_dims.d[axis_] = output_length_.at(index);
  return output_dims;
}
int SplitPlugin::initialize() {
  PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
  std::vector<int> segment_offsets(1, 0);
  for (int i = 0; i < this->getNbOutputs(); ++i) {
    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
...@@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
  return cudaGetLastError() != cudaSuccess;
}
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -14,61 +14,58 @@
#pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class SplitPlugin : public PluginTensorRT {
 public:
  SplitPlugin(int axis, std::vector<int> const &output_lengths)
      : axis_(axis), output_length_(output_lengths) {}
  SplitPlugin(void const *serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
    DeserializeValue(&serial_data, &serial_length, &axis_);
    DeserializeValue(&serial_data, &serial_length, &output_length_);
  }
  SplitPlugin *clone() const override {
    return new SplitPlugin(axis_, output_length_);
  }
  const char *getPluginType() const override { return "split"; }
  int getNbOutputs() const override { return output_length_.size(); }
  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims *input_dims,
                                     int num_inputs) override;
  int initialize() override;
  int enqueue(int batchSize, const void *const *inputs, void **outputs,
              void *workspace, cudaStream_t stream) override;
 protected:
  size_t getSerializationSize() override {
    return SerializedSize(axis_) + SerializedSize(output_length_) +
           getBaseSerializationSize();
  }
  void serialize(void *buffer) override {
    serializeBase(buffer);
    SerializeValue(&buffer, axis_);
    SerializeValue(&buffer, output_length_);
  }
  int axis_;
  std::vector<int> output_length_;
  int nx_, ny_, nz_;
  std::vector<int> segment_offsets_;
};
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -17,6 +17,7 @@
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
void PluginTensorRT::serializeBase(void*& buffer) {
  SerializeValue(&buffer, input_dims_);
...@@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) {
  SerializeValue(&buffer, data_format_);
}
void PluginTensorRT::deserializeBase(void const*& serial_data,
                                     size_t& serial_length) {
  DeserializeValue(&serial_data, &serial_length, &input_dims_);
  DeserializeValue(&serial_data, &serial_length, &max_batch_size_);
  DeserializeValue(&serial_data, &serial_length, &data_type_);
  DeserializeValue(&serial_data, &serial_length, &data_format_);
}
size_t PluginTensorRT::getBaseSerializationSize() {
...@@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
          (format == nvinfer1::PluginFormat::kNCHW));
}
void PluginTensorRT::configureWithFormat(
    const nvinfer1::Dims* input_dims, int num_inputs,
    const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type,
    nvinfer1::PluginFormat format, int max_batch_size) {
  data_type_ = type;
  data_format_ = format;
  input_dims_.assign(input_dims, input_dims + num_inputs);
  max_batch_size_ = max_batch_size;
}
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -14,23 +14,30 @@
#pragma once
#include <NvInfer.h>
#include <cstring>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PluginTensorRT : public nvinfer1::IPluginExt {
 public:
  PluginTensorRT() {}
  // It was used for TensorRT deserialization.
  // It should not be called by users.
  PluginTensorRT(const void* serialized_data, size_t length) {}
  virtual ~PluginTensorRT() {}
  nvinfer1::Dims const& getInputDims(int index) const {
    return input_dims_.at(index);
  }
...@@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
  nvinfer1::DataType getDataType() const { return data_type_; }
  nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
  virtual const char* getPluginVersion() const { return "1"; }
  void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); }
  std::vector<nvinfer1::ITensor*>& GetInputs() { return inputs_; }
  virtual nvinfer1::IPluginExt* clone() const = 0;
  virtual const char* getPluginType() const = 0;
  // Following functions are inherit from nvinfer1::IPluginExt
  // Get the number of outputs from the layer
  int getNbOutputs() const { return 1; }
  // Get the dimension of an output tensor
  virtual nvinfer1::Dims getOutputDimensions(int index,
                                             const nvinfer1::Dims* input_dims,
                                             int num_inputs) = 0;
  // Find the workspace size required by the layer
  size_t getWorkspaceSize(int) const override { return 0; }
  // Initialize the layer for execution.
  // This is called when the engine is created.
  int initialize() override { return 0; }
  // Shutdown the layer. This is called when the engine is destroyed
  void terminate() override {}
  // Execute the layer
  virtual int enqueue(int batch_size, const void* const* inputs, void** outputs,
                      void* workspace, cudaStream_t stream) = 0;
  // Find the size of the serialization buffer required
  virtual size_t getSerializationSize() = 0;
  // Serialize the layer config to buffer.
  // TensorRT will call this func to serialize the configuration of TensorRT
  // engine. It should not be called by users.
  virtual void serialize(void* buffer) = 0;
  // Check format support. The default is FLOAT32 and NCHW.
  bool supportsFormat(nvinfer1::DataType type,
                      nvinfer1::PluginFormat format) const override;
  // Configure the layer
  void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs,
                           const nvinfer1::Dims* output_dims, int num_outputs,
                           nvinfer1::DataType type,
                           nvinfer1::PluginFormat format,
                           int max_batch_size) override;
 protected:
  // Deserialize input_dims, max_batch_size, data_type, data_format
  void deserializeBase(void const*& serial_data,  // NOLINT
                       size_t& serial_length);    // NOLINT
  size_t getBaseSerializationSize();
  // Serialize input_dims, max_batch_size, data_type, data_format
  void serializeBase(void*& buffer);  // NOLINT
  std::vector<nvinfer1::Dims> input_dims_;
  size_t max_batch_size_;
  nvinfer1::DataType data_type_;
  nvinfer1::PluginFormat data_format_;
  std::vector<nvinfer1::ITensor*> inputs_;
};
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
    return;
  }
  LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config);
}
void CompareResult(const std::vector<PaddleTensor> &outputs,
......
...@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_kernel.h"  // TODO(TJ): remove me
namespace paddle {
namespace operators {
...@@ -60,257 +59,83 @@ void VXXJitCode::generate() {
    offset += sizeof(float) * YMM_FLOAT_BLOCK;
  }
  int rest = num_ % YMM_FLOAT_BLOCK;
  while (rest > 0) {
    int block = XMM_FLOAT_BLOCK;
    if (rest >= 4) {
      block = 4;
      if (scalar_index_ != 1) {
        vmovups(xmm_src1, ptr[param1 + offset]);
      }
      if (scalar_index_ != 2) {
        vmovups(xmm_src2, ptr[param2 + offset]);
      }
    } else if (rest >= 2) {
      block = 2;
      if (scalar_index_ != 1) {
        vmovq(xmm_src1, ptr[param1 + offset]);
      }
      if (scalar_index_ != 2) {
        vmovq(xmm_src2, ptr[param2 + offset]);
      }
    } else {
      block = 1;
      if (scalar_index_ != 1) {
        vmovss(xmm_src1, ptr[param1 + offset]);
      }
      if (scalar_index_ != 2) {
        vmovss(xmm_src2, ptr[param2 + offset]);
      }
    }
    switch (type_) {
      case operand_type::mul:
        vmulps(xmm_dst, xmm_src1, xmm_src2);
        break;
      case operand_type::add:
        vaddps(xmm_dst, xmm_src1, xmm_src2);
        break;
      default:
        break;
    }
    if (with_relu_) {
      vmaxps(xmm_dst, xmm_zero, xmm_dst);
    }
    if (rest >= 4) {
      vmovups(ptr[param3 + offset], xmm_dst);
    } else if (rest >= 2) {
      vmovq(ptr[param3 + offset], xmm_dst);
    } else {
      vmovss(ptr[param3 + offset], xmm_dst);
    }
    offset += sizeof(float) * block;
    rest -= block;
  }
  ret();
}
#define ALIGN32 __attribute__((aligned(32))) const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
#define EXP_HIG 88.3762626647949f REPEAT_8TIMES(2.f),
#define EXP_LOW -88.3762626647949f REPEAT_8TIMES(0.5f),
#define CEPHES_LOG2EF 1.44269504088896341 REPEAT_8TIMES(EXP_HIG),
#define CEPHES_EXP_C1 0.693359375 REPEAT_8TIMES(EXP_LOW),
#define CEPHES_EXP_C2 -2.12194440e-4 REPEAT_8TIMES(CEPHES_LOG2EF),
#define CEPHES_EXP_P0 1.9875691500E-4 REPEAT_8TIMES(CEPHES_EXP_C1),
#define CEPHES_EXP_P1 1.3981999507E-3 REPEAT_8TIMES(CEPHES_EXP_C2),
#define CEPHES_EXP_P2 8.3334519073E-3 REPEAT_8TIMES(CEPHES_EXP_P0),
#define CEPHES_EXP_P3 4.1665795894E-2 REPEAT_8TIMES(CEPHES_EXP_P1),
#define CEPHES_EXP_P4 1.6666665459E-1 REPEAT_8TIMES(CEPHES_EXP_P2),
#define CEPHES_EXP_P5 5.0000001201E-1 REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) int g_tmp_mem[16] ALIGN32 = {0};
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
static const float exp_float_consts[] ALIGN32 = {
REPEAT_8TIMES(1.f),
REPEAT_8TIMES(2.f),
REPEAT_8TIMES(0.5f),
REPEAT_8TIMES(EXP_HIG),
REPEAT_8TIMES(EXP_LOW),
REPEAT_8TIMES(CEPHES_LOG2EF),
REPEAT_8TIMES(CEPHES_EXP_C1),
REPEAT_8TIMES(CEPHES_EXP_C2),
REPEAT_8TIMES(CEPHES_EXP_P0),
REPEAT_8TIMES(CEPHES_EXP_P1),
REPEAT_8TIMES(CEPHES_EXP_P2),
REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
static int g_tmp_mem[16] ALIGN32 = {0};
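Each named constant in the table above is stored as one YMM-wide row of eight identical floats, so the OFFSET_EXP_* byte offsets (now declared in jit_code.h) simply select row i of exp_float_consts. A tiny standalone illustration of that layout, with the block size and one offset redeclared locally for the example:

#include <cassert>
#include <cstddef>

constexpr int kYmmFloatBlock = 8;               // YMM_FLOAT_BLOCK
constexpr float kLog2e = 1.44269504088896341f;  // CEPHES_LOG2EF

int main() {
  // Row 5 of the table holds REPEAT_8TIMES(CEPHES_LOG2EF).
  float table[6 * kYmmFloatBlock] = {};
  for (int i = 0; i < kYmmFloatBlock; ++i) table[5 * kYmmFloatBlock + i] = kLog2e;
  // OFFSET_EXP_LOG2EF = 5 * YMM_FLOAT_BLOCK * sizeof(float) is a byte offset.
  const std::size_t offset_log2ef = 5 * kYmmFloatBlock * sizeof(float);
  const float* row = reinterpret_cast<const float*>(
      reinterpret_cast<const char*>(table) + offset_log2ef);
  assert(row[0] == kLog2e && row[kYmmFloatBlock - 1] == kLog2e);
  return 0;
}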
bool VActJitCode::init(int d, operand_type type) { bool VActJitCode::init(int d, operand_type type) {
bool ok = MayIUse(avx); // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
if (type == operand_type::relu) { return MayIUse(avx);
return ok;
} else if (type == operand_type::exp) {
// exp is slower than mkl when d >= 256
return ok && d % 8 == 0 && d < 256;
} else {
// TODO(TJ): support more
return ok && d % 8 == 0;
}
}
void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) {
vmaxps(ymm_dst, ymm_zero, ymm_src);
}
void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enforce
// check all idx can not equal
ymm_t ymm_fx = ymm_t(fx_idx);
ymm_t ymm_fy = ymm_t(fy_idx);
ymm_t ymm_mask = ymm_t(mask_idx);
ymm_t ymm_tmp = ymm_t(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
vminps(ymm_src, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
vmaxps(ymm_src, ymm_src, ymm_tmp);
// express exp(x) as exp(g + n*log(2))
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
vmulps(ymm_fx, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
vaddps(ymm_fx, ymm_fx, ymm_tmp);
vroundps(ymm_fy, ymm_fx, 0x01);
// if greater, subtract 1
vcmpgtps(ymm_mask, ymm_fy, ymm_fx);
vmovaps(ymm_tmp, ptr[reg_ptr_global]);
vandps(ymm_mask, ymm_mask, ymm_tmp);
vsubps(ymm_fx, ymm_fy, ymm_mask);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
vmulps(ymm_fy, ymm_fx, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
ymm_t ymm_z = ymm_t(ymm_mask.getIdx());
vmulps(ymm_z, ymm_fx, ymm_tmp);
vsubps(ymm_src, ymm_src, ymm_fy);
vsubps(ymm_src, ymm_src, ymm_z);
vmulps(ymm_z, ymm_src, ymm_src);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
vmulps(ymm_dst, ymm_src, ymm_tmp);
for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
i += (YMM_FLOAT_BLOCK * sizeof(float))) {
vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmulps(ymm_dst, ymm_dst, ymm_src);
}
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmulps(ymm_dst, ymm_dst, ymm_z);
vaddps(ymm_dst, ymm_dst, ymm_src);
vmovaps(ymm_tmp, ptr[reg_ptr_global]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
// build 2^n
ymm_t ymm_int = ymm_fx;
vcvttps2dq(ymm_int, ymm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(ymm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2)) {
vpaddd(ymm_int, ymm_int, ymm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(ymm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
vmovdqa(ptr[reg_ptr_tmp], ymm_int);
vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp], xtmp1);
// next 128bits
vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]);
vmovdqa(xtmp2,
ptr[reg_ptr_tmp +
(YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1);
// load out
vmovdqa(ymm_int, ptr[reg_ptr_tmp]);
}
vmulps(ymm_dst, ymm_dst, ymm_int);
pop(reg_ptr_global);
}
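The exp_ymm body deleted above (and its templated replacement exp_jmm in jit_code.h) emits the classic Cephes single-precision exp: clamp the input, split x = g + n*ln(2), evaluate a degree-5 polynomial for exp(g), and rebuild 2^n by writing n+127 into the exponent field of a float. A scalar sketch of the same algorithm, for reference only and not the shipped kernel:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

float exp_approx(float x) {
  const float kExpHig = 88.3762626647949f, kExpLow = -88.3762626647949f;
  const float kLog2e = 1.44269504088896341f;
  const float kC1 = 0.693359375f, kC2 = -2.12194440e-4f;       // ln(2) split in two parts
  const float kP[6] = {1.9875691500e-4f, 1.3981999507e-3f, 8.3334519073e-3f,
                       4.1665795894e-2f, 1.6666665459e-1f, 5.0000001201e-1f};
  x = std::min(std::max(x, kExpLow), kExpHig);                  // EXP_HIG / EXP_LOW clamp
  const float fx = std::floor(x * kLog2e + 0.5f);               // n = round(x / ln 2)
  x -= fx * kC1 + fx * kC2;                                     // g = x - n*ln(2)
  const float z = x * x;
  float y = kP[0];
  for (int i = 1; i < 6; ++i) y = y * x + kP[i];                // P0..P5 polynomial
  y = y * z + x + 1.0f;                                         // ~exp(g)
  const int32_t bits = (static_cast<int32_t>(fx) + 127) << 23;  // 2^n via exponent bits
  float pow2n;
  std::memcpy(&pow2n, &bits, sizeof(pow2n));
  return y * pow2n;
}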
void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
// y = 1 / (1 + e^-x)
ymm_t ymm_tmp = ymm_t(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
vminps(ymm_src, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
vmaxps(ymm_src, ymm_src, ymm_tmp);
vxorps(ymm_tmp, ymm_tmp, ymm_tmp);
vsubps(ymm_src, ymm_tmp, ymm_src);
exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vdivps(ymm_dst, ymm_tmp, ymm_dst);
pop(reg_ptr_global);
}
void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
// y = 2 / (1 + e^(-2x)) - 1
ymm_t ymm_tmp = ymm_t(tmp_idx);
ymm_t ymm_zero = ymm_t(mask_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vxorps(ymm_zero, ymm_zero, ymm_zero);
vsubps(ymm_tmp, ymm_zero, ymm_tmp);
vmulps(ymm_src, ymm_src, ymm_tmp);
exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vdivps(ymm_dst, ymm_tmp, ymm_dst);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vsubps(ymm_dst, ymm_dst, ymm_tmp);
pop(reg_ptr_global);
} }
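The two helpers removed above reduce sigmoid and tanh to that shared exp routine after clipping the input against SIGMOID_THRESHOLD_MIN/MAX. Reusing exp_approx and the <algorithm> include from the previous sketch, the scalar equivalents are:

// Sketch only: the identities implemented by sigmoid_ymm / tanh_ymm
// (and by sigmoid_jmm / tanh_jmm in the new header).
float sigmoid_approx(float x) {
  x = std::min(std::max(x, -40.0f), 13.0f);              // SIGMOID_THRESHOLD_{MIN,MAX}
  return 1.0f / (1.0f + exp_approx(-x));                 // y = 1 / (1 + e^-x)
}

float tanh_approx(float x) {
  return 2.0f / (1.0f + exp_approx(-2.0f * x)) - 1.0f;   // y = 2 / (1 + e^(-2x)) - 1
}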
void VActJitCode::generate() { void VActJitCode::generate() {
...@@ -324,16 +149,16 @@ void VActJitCode::generate() { ...@@ -324,16 +149,16 @@ void VActJitCode::generate() {
vmovups(ymm_src, ptr[param1 + offset]); vmovups(ymm_src, ptr[param1 + offset]);
switch (type_) { switch (type_) {
case operand_type::relu: case operand_type::relu:
relu_ymm(ymm_dst, ymm_src, ymm_zero); relu_jmm<ymm_t>(ymm_dst, ymm_src, ymm_zero);
break; break;
case operand_type::exp: case operand_type::exp:
exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); exp_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::sigmoid: case operand_type::sigmoid:
sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); sigmoid_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::tanh: case operand_type::tanh:
tanh_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); tanh_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::identity: case operand_type::identity:
break; break;
...@@ -343,30 +168,44 @@ void VActJitCode::generate() { ...@@ -343,30 +168,44 @@ void VActJitCode::generate() {
vmovups(ptr[param2 + offset], ymm_dst); vmovups(ptr[param2 + offset], ymm_dst);
offset += sizeof(float) * YMM_FLOAT_BLOCK; offset += sizeof(float) * YMM_FLOAT_BLOCK;
} }
if (type_ != operand_type::relu) {
// TODO(TJ): remove me
ret();
return;
}
int rest = num_ % YMM_FLOAT_BLOCK; int rest = num_ % YMM_FLOAT_BLOCK;
if (rest >= 4) { while (rest > 0) {
vmovups(xmm_src, ptr[param1 + offset]); int block = XMM_FLOAT_BLOCK;
vmaxps(xmm_dst, xmm_zero, xmm_src); if (rest >= 4) {
vmovups(ptr[param2 + offset], xmm_dst); block = 4;
offset += sizeof(float) * 4; vmovups(xmm_src, ptr[param1 + offset]);
rest -= 4; } else if (rest >= 2) {
} block = 2;
if (rest >= 2) { vmovq(xmm_src, ptr[param1 + offset]);
vmovups(xmm_src, ptr[param1 + offset]); } else {
vmaxps(xmm_dst, xmm_zero, xmm_src); block = 1;
vmovq(ptr[param2 + offset], xmm_dst); vmovss(xmm_src, ptr[param1 + offset]);
offset += sizeof(float) * 2; }
rest -= 2; switch (type_) {
} case operand_type::relu:
if (rest > 0) { relu_jmm<xmm_t>(xmm_dst, xmm_src, xmm_zero);
vmovups(xmm_src, ptr[param1 + offset]); break;
vmaxps(xmm_dst, xmm_zero, xmm_src); case operand_type::exp:
vmovss(ptr[param2 + offset], xmm_dst); exp_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
case operand_type::sigmoid:
sigmoid_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
case operand_type::tanh:
tanh_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
default:
break;
}
if (rest >= 4) {
vmovups(ptr[param2 + offset], xmm_dst);
} else if (rest >= 2) {
vmovq(ptr[param2 + offset], xmm_dst);
} else {
vmovss(ptr[param2 + offset], xmm_dst);
}
offset += sizeof(float) * block;
rest -= block;
} }
ret(); ret();
} }
......
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/operators/math/jit_gen.h" #include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -40,6 +42,51 @@ typedef enum { ...@@ -40,6 +42,51 @@ typedef enum {
identity identity
} operand_type; } operand_type;
extern const float exp_float_consts[];
extern const int exp_int_0x7f[];
extern int g_tmp_mem[];
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_EXP_C1 0.693359375
#define CEPHES_EXP_C2 -2.12194440e-4
#define CEPHES_EXP_P0 1.9875691500E-4
#define CEPHES_EXP_P1 1.3981999507E-3
#define CEPHES_EXP_P2 8.3334519073E-3
#define CEPHES_EXP_P3 4.1665795894E-2
#define CEPHES_EXP_P4 1.6666665459E-1
#define CEPHES_EXP_P5 5.0000001201E-1
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
class VXXJitCode : public JitCode { class VXXJitCode : public JitCode {
public: public:
...@@ -127,21 +174,140 @@ class VActJitCode : public JitCode { ...@@ -127,21 +174,140 @@ class VActJitCode : public JitCode {
void generate() override; void generate() override;
protected: protected:
// compute relu with ymm // compute relu with ymm, xmm
void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, template <typename JMM>
const Xbyak::Ymm& zero); void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT
vmaxps(dst, src, zero);
}
// compute exp with ymm // compute exp with ymm, xmm
void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT
int mask_idx = 4, int tmp_idx = 5) {
using namespace platform::jit; // NOLINT
assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enforce
// check all idx can not equal
JMM jmm_fx = JMM(fx_idx);
JMM jmm_fy = JMM(fy_idx);
JMM jmm_mask = JMM(mask_idx);
JMM jmm_tmp = JMM(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
vminps(src, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
vmaxps(src, src, jmm_tmp);
// express exp(x) as exp(g + n*log(2))
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
vmulps(jmm_fx, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
vaddps(jmm_fx, jmm_fx, jmm_tmp);
vroundps(jmm_fy, jmm_fx, 0x01);
// if greater, subtract 1
vcmpgtps(jmm_mask, jmm_fy, jmm_fx);
vmovaps(jmm_tmp, ptr[reg_ptr_global]);
vandps(jmm_mask, jmm_mask, jmm_tmp);
vsubps(jmm_fx, jmm_fy, jmm_mask);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
vmulps(jmm_fy, jmm_fx, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
JMM ymm_z = JMM(jmm_mask.getIdx());
vmulps(ymm_z, jmm_fx, jmm_tmp);
vsubps(src, src, jmm_fy);
vsubps(src, src, ymm_z);
vmulps(ymm_z, src, src);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
vmulps(dst, src, jmm_tmp);
for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
i += (YMM_FLOAT_BLOCK * sizeof(float))) {
vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4
vaddps(dst, dst, jmm_tmp);
vmulps(dst, dst, src);
}
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
vaddps(dst, dst, jmm_tmp);
vmulps(dst, dst, ymm_z);
vaddps(dst, dst, src);
vmovaps(jmm_tmp, ptr[reg_ptr_global]);
vaddps(dst, dst, jmm_tmp);
// build 2^n
JMM ymm_int = jmm_fx;
vcvttps2dq(ymm_int, jmm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(jmm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2) || std::is_same<JMM, xmm_t>::value) {
vpaddd(ymm_int, ymm_int, jmm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
vmovdqa(ptr[reg_ptr_tmp], ymm_int);
vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp], xtmp1);
// next 128bits
vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]);
vmovdqa(xtmp2, ptr[reg_ptr_tmp +
(YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1);
// load out
vmovdqa(ymm_int, ptr[reg_ptr_tmp]);
}
vmulps(dst, dst, ymm_int);
pop(reg_ptr_global);
}
// compute sigmoid with ymm // compute sigmoid with ymm, xmm
void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) {
// y = 1 / (1 + e^-x)
JMM jmm_tmp = JMM(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
vminps(src, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
vmaxps(src, src, jmm_tmp);
vxorps(jmm_tmp, jmm_tmp, jmm_tmp);
vsubps(src, jmm_tmp, src);
exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(dst, dst, jmm_tmp);
vdivps(dst, jmm_tmp, dst);
pop(reg_ptr_global);
}
// compute tanh with ymm // compute tanh with ymm, xmm
void tanh_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT
int mask_idx = 4, int tmp_idx = 5) {
// y = 2 / (1 + e^(-2x)) - 1
JMM jmm_tmp = JMM(tmp_idx);
JMM jmm_zero = JMM(mask_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vxorps(jmm_zero, jmm_zero, jmm_zero);
vsubps(jmm_tmp, jmm_zero, jmm_tmp);
vmulps(src, src, jmm_tmp);
exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(dst, dst, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vdivps(dst, jmm_tmp, dst);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vsubps(dst, dst, jmm_tmp);
pop(reg_ptr_global);
}
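For reference, the identity tanh_jmm relies on follows directly from the definition of tanh:

\tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
         = \frac{1 - e^{-2x}}{1 + e^{-2x}}
         = \frac{2}{1 + e^{-2x}} - 1

so one exp evaluation at -2x, an add of 1, a divide of 2 by the sum, and a subtract of 1 (the broadcast constants at OFFSET_EXP_ONE and OFFSET_EXP_TWO) reproduce tanh, which is exactly the instruction sequence above.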
protected: protected:
int num_; int num_;
......
...@@ -26,6 +26,7 @@ namespace operators { ...@@ -26,6 +26,7 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0 #define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0 #define EXP_MAX_INPUT 40.0
......
...@@ -33,6 +33,9 @@ limitations under the License. */ ...@@ -33,6 +33,9 @@ limitations under the License. */
constexpr int repeat = 20000; constexpr int repeat = 20000;
// TODO(TJ): benchmark and test should be separated,
// benchmark should verify more sizes
inline double GetCurrentUS() { inline double GetCurrentUS() {
struct timeval time; struct timeval time;
gettimeofday(&time, NULL); gettimeofday(&time, NULL);
...@@ -66,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { ...@@ -66,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
TEST(JitKernel, vrelu) { TEST(JitKernel, vrelu) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 256, 512}) { for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -10.f, 1.f); RandomVec<float>(d, x.data(), -10.f, 1.f);
...@@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) { ...@@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
TEST(JitKernel, vexp) { TEST(JitKernel, vexp) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 128, 256}) { for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -231,7 +234,7 @@ void vsigmoid_better( ...@@ -231,7 +234,7 @@ void vsigmoid_better(
TEST(JitKernel, vsigmoid) { TEST(JitKernel, vsigmoid) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -295,7 +298,7 @@ void vtanh_better( ...@@ -295,7 +298,7 @@ void vtanh_better(
TEST(JitKernel, vtanh) { TEST(JitKernel, vtanh) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -386,7 +389,7 @@ void lstm_ctht_better( ...@@ -386,7 +389,7 @@ void lstm_ctht_better(
TEST(JitKernel, lstm) { TEST(JitKernel, lstm) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) {
int d4 = d * 4; int d4 = d * 4;
int d3 = d * 3; int d3 = d * 3;
std::vector<float> x(d4), xref(d4); std::vector<float> x(d4), xref(d4);
...@@ -759,7 +762,7 @@ TEST(JitKernel, vaddrelu) { ...@@ -759,7 +762,7 @@ TEST(JitKernel, vaddrelu) {
float* zref_data = zref.data(); float* zref_data = zref.data();
auto trefs = GetCurrentUS(); auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
vadd_ref(d, x_data, y_data, zref_data); vaddrelu_ref(d, x_data, y_data, zref_data);
} }
auto trefe = GetCurrentUS(); auto trefe = GetCurrentUS();
auto tmkls = GetCurrentUS(); auto tmkls = GetCurrentUS();
......
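The one-line change in the vaddrelu benchmark above makes the timing loop call the matching reference instead of the plain vadd reference. The test file defines its own vaddrelu_ref, so the following is only a sketch of the semantics that reference is expected to have (element-wise add followed by ReLU):

// Sketch of the expected semantics: z[i] = max(x[i] + y[i], 0).
void vaddrelu_ref_sketch(const int n, const float* x, const float* y, float* z) {
  for (int i = 0; i < n; ++i) {
    const float sum = x[i] + y[i];
    z[i] = sum > 0.f ? sum : 0.f;
  }
}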
...@@ -19,7 +19,8 @@ namespace paddle { ...@@ -19,7 +19,8 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename DeviceContext, typename T, bool is_test> template <typename DeviceContext, typename T, bool is_test,
typename Enable = void>
class SoftmaxFunctor { class SoftmaxFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -32,8 +33,8 @@ struct ValueClip { ...@@ -32,8 +33,8 @@ struct ValueClip {
} }
}; };
template <typename DeviceContext, typename T, bool is_test> template <typename DeviceContext, typename T, bool is_test, typename Enable>
void SoftmaxFunctor<DeviceContext, T, is_test>::operator()( void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
const DeviceContext& context, const framework::Tensor* X, const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X); auto logits = EigenMatrix<T>::From(*X);
...@@ -65,36 +66,46 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()( ...@@ -65,36 +66,46 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
.broadcast(one_by_class)); .broadcast(one_by_class));
} }
template <typename DeviceContext, typename T> template <class DeviceContext>
class SoftmaxFunctor<DeviceContext, T, true> { using enable_if_CPU = typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type;
template <typename DeviceContext>
class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X); auto in_dims = X->dims();
auto softmax = EigenMatrix<T>::From(*Y); auto out_dims = Y->dims();
const float* in_data = X->data<float>();
float* out_data = Y->data<float>();
const int kBatchDim = 0; const int kBatchDim = 0;
const int kClassDim = 1; const int kClassDim = 1;
// 2D data. Batch x C
const int batch_size = logits.dimension(kBatchDim); const int batch_size = in_dims[kBatchDim];
const int num_classes = logits.dimension(kClassDim); const int num_classes = in_dims[kClassDim];
std::vector<float> entities(batch_size);
Eigen::DSizes<int, 1> along_class(kClassDim); auto blas = math::GetBlas<DeviceContext, float>(context);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1); for (int n = 0; n < batch_size; ++n) {
Eigen::DSizes<int, 2> one_by_class(1, num_classes); entities[n] = in_data[n * num_classes];
for (int c = 1; c < num_classes; ++c) {
auto shifted_logits = (logits - entities[n] = in_data[n * num_classes + c] > entities[n]
logits.maximum(along_class) ? in_data[n * num_classes + c]
.eval() : entities[n];
.reshape(batch_by_one) }
.broadcast(one_by_class)); for (int c = 0; c < num_classes; ++c) {
out_data[n * num_classes + c] =
softmax.device(*context.eigen_device()) = shifted_logits.exp(); in_data[n * num_classes + c] - entities[n];
softmax.device(*context.eigen_device()) = (softmax * }
softmax.sum(along_class) }
.inverse()
.eval() blas.VEXP(num_classes * batch_size, out_data, out_data);
.reshape(batch_by_one) for (int n = 0; n < batch_size; ++n) {
.broadcast(one_by_class)); entities[n] = out_data[n * num_classes];
for (int c = 1; c < num_classes; ++c) {
entities[n] += out_data[n * num_classes + c];
}
blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]);
}
} }
}; };
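The CPU float specialization above works directly on the 2-D buffer: take the per-row max, shift, call the batched blas VEXP once, then compute each row sum and rescale with SCAL. A plain-C++ sketch of the same sequence, with std::exp standing in for the blas call:

#include <algorithm>
#include <cmath>

// Sketch only: out[n][c] = exp(in[n][c] - max_c in[n][*]) / sum_c exp(...)
void softmax_2d(const float* in, float* out, int batch_size, int num_classes) {
  for (int n = 0; n < batch_size; ++n) {
    const float* row_in = in + n * num_classes;
    float* row_out = out + n * num_classes;
    const float row_max = *std::max_element(row_in, row_in + num_classes);
    float row_sum = 0.f;
    for (int c = 0; c < num_classes; ++c) {
      row_out[c] = std::exp(row_in[c] - row_max);   // blas.VEXP in the real code
      row_sum += row_out[c];
    }
    for (int c = 0; c < num_classes; ++c) {
      row_out[c] /= row_sum;                        // blas.SCAL(1/row_sum) in the real code
    }
  }
}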
......
...@@ -35,8 +35,10 @@ class SoftmaxKernel : public framework::OpKernel<T> { ...@@ -35,8 +35,10 @@ class SoftmaxKernel : public framework::OpKernel<T> {
Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
#ifdef ON_INFER #ifdef PADDLE_ON_INFERENCE
math::SoftmaxFunctor<DeviceContext, T, true>()( math::SoftmaxFunctor<
DeviceContext, T,
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d); context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
#else #else
math::SoftmaxFunctor<DeviceContext, T, false>()( math::SoftmaxFunctor<DeviceContext, T, false>()(
......
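The selection between the Eigen path and the hand-written CPU path hinges on the new defaulted Enable template parameter plus enable_if: only the <CPUDeviceContext, float, is_test=true> instantiation matches the partial specialization in softmax.h, everything else falls back to the primary template. A stripped-down sketch of that dispatch pattern, with stand-in context types rather than the real Paddle classes:

#include <type_traits>

struct CPUDeviceContext {};   // stand-ins for the real device contexts
struct CUDADeviceContext {};

template <class DeviceContext>
using enable_if_CPU = typename std::enable_if<
    std::is_same<DeviceContext, CPUDeviceContext>::value>::type;

// Primary template: the generic (Eigen-based) implementation.
template <typename DeviceContext, typename T, bool is_test,
          typename Enable = void>
struct Functor {
  static const char* which() { return "generic"; }
};

// Partial specialization chosen only for <CPU, float, true>.
template <typename DeviceContext>
struct Functor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
  static const char* which() { return "optimized CPU float inference"; }
};

// Functor<CPUDeviceContext, float, true>::which()   -> "optimized CPU float inference"
// Functor<CUDADeviceContext, float, true>::which()  -> "generic"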
...@@ -147,20 +147,32 @@ class StackKernel : public framework::OpKernel<T> { ...@@ -147,20 +147,32 @@ class StackKernel : public framework::OpKernel<T> {
auto &dim = x[0]->dims(); auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i]; for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
int total_num = pre * n * post;
auto &dev_ctx = ctx.template device_context<DeviceContext>();
#ifdef __NVCC__ #ifdef __NVCC__
int total_num = pre * n * post;
auto &dev_ctx = ctx.template device_context<DeviceContext>();
thrust::device_vector<const T *> device_x_vec(x_datas); thrust::device_vector<const T *> device_x_vec(x_datas);
auto x_data_arr = device_x_vec.data().get(); auto x_data_arr = device_x_vec.data().get();
#else
auto x_data_arr = x_datas.data();
#endif
StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
#ifdef __NVCC__
// Wait() must be called because device_x_vec may be destructed before // Wait() must be called because device_x_vec may be destructed before
// kernel ends // kernel ends
dev_ctx.Wait(); dev_ctx.Wait();
#else
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
#endif #endif
} }
}; };
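The new CPU branch above drops StackFunctorForRange in favour of direct memcpy of contiguous slices: for each outer index it copies one post-sized slice from each of the n inputs into the output. A standalone sketch of that copy pattern:

#include <cstring>
#include <vector>

// Sketch only: each input is viewed as pre x post; the output interleaves the
// n inputs along the new axis, giving an output of shape pre x n x post.
template <typename T>
void stack_cpu(const std::vector<const T*>& x, T* y, int pre, int n, int post) {
  std::size_t x_offset = 0;
  std::size_t y_offset = 0;
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      std::memcpy(y + y_offset, x[j] + x_offset, post * sizeof(T));
      y_offset += post;
    }
    x_offset += post;
  }
}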
......
...@@ -117,7 +117,8 @@ def __bootstrap__(): ...@@ -117,7 +117,8 @@ def __bootstrap__():
'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
"dist_threadpool_size", 'eager_delete_tensor_gb', "dist_threadpool_size", 'eager_delete_tensor_gb',
'allocator_strategy', 'reader_queue_speed_test_mode' 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir'
] ]
if os.name != 'nt': if os.name != 'nt':
read_env_flags.append('warpctc_dir') read_env_flags.append('warpctc_dir')
......
...@@ -5776,7 +5776,7 @@ def image_resize(input, ...@@ -5776,7 +5776,7 @@ def image_resize(input,
Examples: Examples:
.. code-block:: python .. code-block:: python
out = fluid.layers.image_resize(input, out_shape=[12, 12]) out = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST")
""" """
resample_methods = { resample_methods = {
'BILINEAR': 'bilinear', 'BILINEAR': 'bilinear',
...@@ -5879,6 +5879,11 @@ def resize_bilinear(input, ...@@ -5879,6 +5879,11 @@ def resize_bilinear(input,
Returns: Returns:
${out_comment}. ${out_comment}.
Examples:
.. code-block:: python
out = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
""" """
return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape) return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape)
...@@ -5925,6 +5930,11 @@ def resize_nearest(input, ...@@ -5925,6 +5930,11 @@ def resize_nearest(input,
Returns: Returns:
${out_comment}. ${out_comment}.
Examples:
.. code-block:: python
out = fluid.layers.resize_nearest(input, out_shape=[12, 12])
""" """
return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape) return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape)
......