Commit baf02328 authored by Qiao Longfei


Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-async-ssa-graph-executor-communicator
test=develop
......@@ -34,7 +34,7 @@ ExternalProject_Add(
BUILD_IN_SOURCE 1
)
ADD_LIBRARY(dgc SHARED IMPORTED GLOBAL)
ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
ADD_DEPENDENCIES(dgc extern_dgc)
......
This diff has been collapsed.
......@@ -195,8 +195,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
......@@ -25,8 +25,12 @@ if(WITH_DISTRIBUTE)
endif()
if(WITH_GPU)
set(dgc_deps "")
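# dgc is only linked on non-Windows platforms; elsewhere the dep list stays empty.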
if(NOT WIN32)
set(dgc_deps dgc)
endif()
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor dgc)
dynload_cuda variable_visitor ${dgc_deps})
nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
if(WITH_DISTRIBUTE)
......
......@@ -68,7 +68,7 @@ void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
for (auto& o_it : outputs) {
for (auto& v : o_it.second) { // values
vars[v] = order;
VLOG(1) << "in all_reduce_deps_pass:" << v;
VLOG(10) << "in all_reduce_deps_pass:" << v;
}
}
order++;
......
......@@ -156,7 +156,6 @@ void InplacePass::ApplyImpl(ir::Graph* graph) const {
continue;
TryInplaceOpInputOutput(op, graph);
}
// graph->ResolveHazard(var_nodes_);
}
void InplacePass::InplaceModifyDesc(const std::string& var,
......@@ -168,7 +167,7 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
op_desc->Flush();
}
}
......@@ -265,8 +264,6 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
// PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
// "op_desc is nullptr");
// some prerequisites must be met if the op is to be inplaced.
PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
......@@ -446,19 +443,20 @@ bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
// check if op2 depends on op1's output
bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
auto print_op = [&](ir::Node* op, const char* name) {
std::ostringstream os;
os << " " << name << " : " << op->Name() << " ";
os << "Input args : ";
for (auto& arg : op->inputs) os << arg->Name() << " ";
os << "Output args : ";
for (auto& arg : op->outputs) os << arg->Name() << " ";
os << "Level : " << op_level_.at(op);
VLOG(4) << os.str();
};
print_op(op1, "OP1");
print_op(op2, "OP2");
if (VLOG_IS_ON(4)) {
auto print_op = [&](ir::Node* op, const char* name) {
std::ostringstream os;
os << " " << name << " : " << op->Name() << " ";
os << "Input args : ";
for (auto& arg : op->inputs) os << arg->Name() << " ";
os << "Output args : ";
for (auto& arg : op->outputs) os << arg->Name() << " ";
os << "Level : " << op_level_.at(op);
VLOG(4) << os.str();
};
print_op(op1, "OP1");
print_op(op2, "OP2");
}
if (op1 == op2) return true;
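// an op can only depend on ops at a strictly lower topological level.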
if (op_level_.at(op1) >= op_level_.at(op2)) return false;
......
......@@ -142,16 +142,15 @@ TEST(OrderedSet, FindBestFitNode) {
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FIXME(liuwei1031) this API has changed,
// disable these tests temporarily
// FindNextBestFitNode
// auto* n = nodes[0].get();
// auto* cache = pool.FindBestFitNode(n);
// PADDLE_ENFORCE(cache->Name() == "a");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "c");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "b");
auto* n = nodes[0].get();
auto* cache = pool.FindBestFitNode(n);
ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c");
auto* cache_b = pool.FindNextBestFitNode(n, cache);
ASSERT_TRUE(cache_b->Name() != cache->Name());
ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c");
cache = pool.FindNextBestFitNode(n, cache_b);
ASSERT_TRUE(cache == nullptr);
}
} // namespace details
......
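The rewritten assertions above pin down the pool's iteration contract: FindBestFitNode returns a first candidate and FindNextBestFitNode walks the remaining ones, yielding nullptr once they are exhausted. A minimal sketch of that contract, reusing the test's names (not part of the diff):
// Visit every best-fit candidate the pool offers for node n.
for (ir::Node* cache = pool.FindBestFitNode(n); cache != nullptr;
     cache = pool.FindNextBestFitNode(n, cache)) {
  VLOG(4) << "best-fit candidate: " << cache->Name();
}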
......@@ -12,9 +12,14 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <iterator>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/inplace_op_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
......@@ -165,118 +170,147 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
namespace paddle {
namespace framework {
// TEST(InferInplace, SingleOpInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("single_op");
// op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
// op->SetOutput("Out", {"test2_out"});
//
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_out");
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 1ul);
// auto it = in_to_outs.begin();
// EXPECT_EQ(it->first, "test2_a");
// EXPECT_EQ(it->second, "test2_out");
// }
//
// TEST(InferInplace, SingleGradOpInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("single_op_grad");
// op->SetInput(GradVarName("Out"), {"test2_out"});
// op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
//
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_out");
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 1ul);
// auto it = in_to_outs.begin();
// EXPECT_EQ(it->first, "test2_out");
// EXPECT_EQ(it->second, "test2_a");
// }
//
// TEST(InferInplace, MultiOutInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("multi_out_op");
// op->SetInput("X", {"a0", "a1"});
// op->SetInput("Y", {"b0"});
// op->SetInput("Z", {"c0", "c1"});
// op->SetOutput("Out", {"o0"});
// op->SetOutput("YOut", {"y0"});
// op->SetOutput("ZOut", {"z0"});
//
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("o0");
// prog.MutableBlock(0)->Var("y0");
// prog.MutableBlock(0)->Var("z0");
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 3ul);
// std::unordered_map<std::string, std::string> expects = {
// {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
// };
// EXPECT_TRUE(expects == in_to_outs);
// }
//
// TEST(InferInplace, MultiGradInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("multi_out_grad");
// op->SetInput(GradVarName("Out"), {"o0"});
// op->SetInput(GradVarName("YOut"), {"y0"});
// op->SetInput(GradVarName("ZOut"), {"z0"});
// op->SetOutput(GradVarName("X"), {"a0", "a1"});
// op->SetOutput(GradVarName("Y"), {"b0"});
// op->SetOutput(GradVarName("Z"), {"c0", "c1"});
//
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("o0");
// prog.MutableBlock(0)->Var("y0");
// prog.MutableBlock(0)->Var("z0");
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
//
// EXPECT_EQ(in_to_outs.size(), 3ul);
// std::unordered_map<std::string, std::string> expects = {
// {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
// };
// EXPECT_TRUE(expects == in_to_outs);
// }
void FakeSuccData(ProgramDesc* prog) { // NOLINT
prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_out");
prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 32, 128, 128});
}
void FakeNoInplaceData(ProgramDesc* prog) { // NOLINT
prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_out");
prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 31, 128, 128});
}
ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) {
ir::Node* op_node = nullptr;
for (auto& item : g->Nodes()) {
if (item->Name() == name) {
op_node = item;
break;
}
}
return op_node;
}
std::unique_ptr<ir::Graph> test_SingleOpInplaceInToOut(
std::unique_ptr<ir::Graph> g) {
std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op");
EXPECT_NE(op_node, nullptr);
pass->Apply(g.get());
return g;
}
TEST(InferInplace, SingleOpInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("single_op");
op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
op->SetOutput("Out", {"test2_out"});
FakeSuccData(&prog);
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
g = test_SingleOpInplaceInToOut(std::move(g));
auto op_node = GetNodeFromGraph(g.get(), "single_op");
EXPECT_EQ(op_node->outputs[0]->Name(), "test2_a");
}
TEST(InferInplace, SingleOpInplaceInToOutNoInplace) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("single_op");
op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
op->SetOutput("Out", {"test2_out"});
FakeNoInplaceData(&prog);
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
g = test_SingleOpInplaceInToOut(std::move(g));
auto op_node = GetNodeFromGraph(g.get(), "single_op");
EXPECT_EQ(op_node->outputs[0]->Name(), "test2_out");
}
TEST(InferInplace, MultiOutInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("multi_out_op");
op->SetInput("X", {"a0", "a1"});
op->SetInput("Y", {"b0"});
op->SetInput("Z", {"c0", "c1"});
op->SetOutput("Out", {"o0"});
op->SetOutput("YOut", {"y0"});
op->SetOutput("ZOut", {"z0"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
pass->Apply(g.get());
auto op_node = GetNodeFromGraph(g.get(), "multi_out_op");
ASSERT_TRUE(op_node != nullptr);
EXPECT_EQ(op_node->outputs[0]->Name(), "a0");
EXPECT_EQ(op_node->outputs[1]->Name(), "b0");
EXPECT_EQ(op_node->outputs[2]->Name(), "c0");
}
TEST(InferInplace, MultiGradInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("multi_out_grad");
op->SetInput(GradVarName("Out"), {"o0"});
op->SetInput(GradVarName("YOut"), {"y0"});
op->SetInput(GradVarName("ZOut"), {"z0"});
op->SetOutput(GradVarName("X"), {"a0", "a1"});
op->SetOutput(GradVarName("Y"), {"b0"});
op->SetOutput(GradVarName("Z"), {"c0", "c1"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024});
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
pass->Apply(g.get());
auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad");
ASSERT_TRUE(op_node != nullptr);
EXPECT_EQ(op_node->outputs[0]->Name(), "o0");
EXPECT_EQ(op_node->outputs[2]->Name(), "y0");
EXPECT_EQ(op_node->outputs[3]->Name(), "c0");
std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
};
}
} // namespace framework
} // namespace paddle
......@@ -68,21 +68,12 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base)
pass_library(simplify_anakin_detection_pattern_pass inference)
pass_library(anakin_fillconstant_elementwisemul_fuse inference)
pass_library(quant_conv2d_dequant_fuse_pass inference)
pass_library(fillconstant_elementwisemul_fuse inference)
# There may be many transpose-flatten structures in a model, and the outputs of
# these structures will be used as inputs to the concat Op. This pattern is
# detected by our pass. The index here represents the number of structures in
# the pattern. We use indices 3 to 6 because these structure counts are common
# in real models.
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
endforeach()
if(ANAKIN_FOUND)
pass_library(simplify_anakin_priorbox_detection_out_pass inference)
endif()
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base mkldnn)
......
......@@ -15,7 +15,7 @@
#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
......@@ -29,8 +29,8 @@ namespace ir {
GET_IR_NODE(elementwise_mul); \
GET_IR_NODE(elementwise_mul_out);
void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
void FillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "fillconstant_elementwisemul_fuse";
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
......@@ -39,8 +39,8 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
->assert_is_op_input("elementwise_mul", "X")
->AsInput();
patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name);
patterns::FillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name);
pattern(x);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
......@@ -79,5 +79,5 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
} // namespace framework
} // namespace paddle
REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
REGISTER_PASS(fillconstant_elementwisemul_fuse,
paddle::framework::ir::FillconstantElementwisemulFuse);
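With the Anakin prefix dropped, anything that enabled this pass by its old name must switch to the new string. A sketch of enabling it, assuming the AnalysisConfig / PaddlePassBuilder API of this release:
// Enable the renamed fuse pass on an inference config (hypothetical usage).
paddle::AnalysisConfig config;
config.pass_builder()->AppendPass("fillconstant_elementwisemul_fuse");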
......@@ -21,9 +21,9 @@ namespace paddle {
namespace framework {
namespace ir {
class AnakinFillconstantElementwisemulFuse : public FusePassBase {
class FillconstantElementwisemulFuse : public FusePassBase {
public:
virtual ~AnakinFillconstantElementwisemulFuse() {}
virtual ~FillconstantElementwisemulFuse() {}
protected:
void ApplyImpl(ir::Graph* graph) const override;
......
......@@ -1471,7 +1471,8 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
}
PDNode *patterns::AnakinDetectionPattern::operator()(
std::vector<PDNode *> conv_in, int times) {
std::vector<PDNode *> conv_in, int times, std::string priorbox_type,
bool is_reshape) {
// The times argument gives the number of repetitions of the
// {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape}
const int kNumFields = 7;
......@@ -1486,37 +1487,38 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
const int kMultiClassSecondInputNmsOffset = times + 1;
std::vector<PDNode *> nodes;
std::string op_after_priorbox = is_reshape ? "reshape2" : "flatten2";
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
->assert_is_op("density_prior_box"));
->assert_is_op(priorbox_type));
nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Boxes")
->assert_is_op_input("reshape2", "X")
->assert_is_op_output(priorbox_type, "Boxes")
->assert_is_op_input(op_after_priorbox, "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
->assert_is_op("reshape2"));
->assert_is_op(op_after_priorbox));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_output(op_after_priorbox)
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Variances")
->assert_is_op_input("reshape2", "X")
->assert_is_op_output(priorbox_type, "Variances")
->assert_is_op_input(op_after_priorbox, "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
->assert_is_op("reshape2"));
->assert_is_op(op_after_priorbox));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_output(op_after_priorbox)
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
}
......@@ -1612,7 +1614,7 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
return multiclass_nms_out;
}
PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
PDNode *elementwise_op_input) {
auto fill_constant =
pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
......@@ -1635,6 +1637,76 @@ PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
return elementwise_mul_out;
}
void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
const std::string &op_type,
const std::string &weight_name,
int times) {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
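// Each repeat of the pattern appends kNumFields consecutive nodes in the order given by these offsets.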
// there is always exactly one quant op.
auto quant_op_in_scale =
pattern->NewNode(GetNodeName("quant_op_in_scale"))
->assert_is_op_input("fake_quantize_range_abs_max", "InScale")
->AsInput();
auto quant_op = pattern->NewNode(GetNodeName("quant_op"))
->assert_is_op("fake_quantize_range_abs_max");
auto quant_op_out_scale =
pattern->NewNode(GetNodeName("quant_op_out_scale"))
->assert_is_op_output("fake_quantize_range_abs_max", "OutScale")
->assert_is_op_input("fake_dequantize_max_abs", "Scale")
->AsIntermediate();
auto quant_op_out =
pattern->NewNode(GetNodeName("quant_op_out"))
->assert_is_op_output("fake_quantize_range_abs_max", "Out")
->assert_is_op_input(op_type)
->AsIntermediate();
// there are `times` pairs of quantized ops and dequant ops
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op_weight") + std::to_string(i))
->assert_is_op_input(op_type, weight_name)
->AsInput());
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op") + std::to_string(i))
->assert_is_op(op_type));
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i))
->assert_is_op_output(op_type)
->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i))
->assert_is_op("fake_dequantize_max_abs"));
nodes.push_back(
pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i))
->assert_is_op_output("fake_dequantize_max_abs", "Out")
->AsOutput());
}
quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
quant_op_out->LinksFrom({quant_op});
for (int i = 0; i < times; i++) {
nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
{quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOffset]});
nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kDequantOpOffset]});
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -848,7 +848,8 @@ struct AnakinDetectionPattern : public PatternBase {
AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times,
std::string priorbox_type, bool is_reshape);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
......@@ -859,9 +860,9 @@ struct AnakinDetectionPattern : public PatternBase {
}
};
struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
const std::string& name_scope)
struct FillConstantElementWiseMulFuse : public PatternBase {
FillConstantElementWiseMulFuse(PDPattern* pattern,
const std::string& name_scope)
: PatternBase(pattern, name_scope,
"anakin_fillconstant_elementwisemul_fuse") {}
......@@ -874,6 +875,22 @@ struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
PATTERN_DECL_NODE(elementwise_mul_out);
};
struct QuantDequantOpFuse : public PatternBase {
QuantDequantOpFuse(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "quant_dequant_fuse") {}
void operator()(PDNode* quant_op_input, const std::string& op_name,
const std::string& weight_name, int times = 1);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
} // namespace patterns
// Link two ir::Nodes from each other.
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
std::string op_type) {
const std::string pattern_name = "quant_dequant_fuse";
// FusePassBase::Init(pattern_name, graph);
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("fake_quantize_range_abs_max", "X")
->AsInput();
std::string quantized_op_type = "";
std::string weight_name = "";
if (op_type == "conv2d") {
quantized_op_type = "conv2d";
weight_name = "Filter";
} else if (op_type == "conv2d_fusion") {
quantized_op_type = "conv2d_fusion";
weight_name = "Filter";
} else if (op_type == "mul") {
quantized_op_type = "mul";
weight_name = "Y";
} else if (op_type == "fc") {
quantized_op_type = "fc";
weight_name = "W";
} else {
PADDLE_THROW(
"QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
"now.");
}
patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
pattern(x, quantized_op_type, weight_name, times);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
PADDLE_ENFORCE(subgraph.count(x));
auto* input_node = subgraph.at(x);
Node* quant_op_in_scale =
subgraph.at(pattern.GetPDNode("quant_op_in_scale"));
Node* quant_op = subgraph.at(pattern.GetPDNode("quant_op"));
Node* quant_op_out_scale =
subgraph.at(pattern.GetPDNode("quant_op_out_scale"));
Node* quant_op_out = subgraph.at(pattern.GetPDNode("quant_op_out"));
std::vector<Node*> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(subgraph.at(
pattern.GetPDNode("quantized_op_weight" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("quantized_op" + std::to_string(i))));
nodes.push_back(subgraph.at(
pattern.GetPDNode("quantized_op_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i))));
}
int bit_length = boost::get<int>(quant_op->Op()->GetAttr("bit_length"));
int range = ((1 << (bit_length - 1)) - 1);
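// e.g. bit_length == 8 gives range == (1 << 7) - 1 == 127.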
// Prepare input scale
std::string input_scale_var_name = quant_op->Op()->Input("InScale").front();
PADDLE_ENFORCE(scope);
const LoDTensor& input_scale_tensor =
scope->FindVar(input_scale_var_name)->Get<LoDTensor>();
PADDLE_ENFORCE(paddle::platform::is_cpu_place(input_scale_tensor.place()));
const float* input_scale_data = input_scale_tensor.data<float>();
float input_scale = input_scale_data[0];
std::unordered_set<const Node*> delete_nodes;
for (int i = 0; i < times; i++) {
// max_range = (range * range) / weight_scale
float max_range = boost::get<float>(
nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range"));
float weight_scale = (range * range) / max_range;
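// recovers weight_scale by inverting max_range = range^2 / weight_scale (see above).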
auto base_op_desc =
*nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto();
std::string new_input = input_node->Name();
std::string new_output =
nodes[i * kNumFields + kDequantOpOutOffset]->Name();
framework::OpDesc new_op_desc(base_op_desc, nullptr);
new_op_desc.SetType(quantized_op_type);
if (quantized_op_type == "conv2d" ||
quantized_op_type == "conv2d_fusion") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Output", {new_output});
} else if (quantized_op_type == "fc") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Out", {new_output});
} else if (quantized_op_type == "mul") {
new_op_desc.SetInput("X", {new_input});
new_op_desc.SetOutput("Out", {new_output});
}
new_op_desc.SetAttr("enable_int8", true);
new_op_desc.SetAttr("input_scale", input_scale);
new_op_desc.SetAttr("weight_scale", weight_scale);
new_op_desc.Flush();
auto* new_op = graph->CreateOpNode(&new_op_desc);
IR_NODE_LINK_TO(input_node, new_op);
IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op);
IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]);
delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]);
}
delete_nodes.insert(quant_op_in_scale);
delete_nodes.insert(quant_op);
delete_nodes.insert(quant_op_out);
delete_nodes.insert(quant_op_out_scale);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph, delete_nodes);
};
gpd(graph, handler);
}
void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "quant_dequant_fuse";
FusePassBase::Init(pattern_name, graph);
std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul"};
auto* scope = param_scope();
for (auto& op_type : quantized_op_types) {
for (int i = 1; i <= 6; i++) {
RunQuantDequant(graph, scope, i, op_type);
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(quant_conv2d_dequant_fuse_pass,
paddle::framework::ir::QuantDequantFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class QuantDequantFusePass : public FusePassBase {
public:
virtual ~QuantDequantFusePass() {}
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -17,25 +17,24 @@
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h"
#include "paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h"
namespace paddle {
namespace framework {
namespace ir {
template <int times>
void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
ir::Graph *graph) const {
void RunSimplifyAnakinDetection(ir::Graph *graph, int times, bool is_density,
bool is_reshape) {
const std::string pattern_name =
"simplify_anakin_detection_pattern_pass" + std::to_string(times);
FusePassBase::Init(pattern_name, graph);
std::string priorbox_type = is_density ? "density_prior_box" : "prior_box";
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
for (int i = 0; i < times; i++) {
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(i))
->assert_is_op_input("density_prior_box", "Input")
->assert_is_op_input(priorbox_type, "Input")
->AsInput());
}
input_nodes.push_back(gpd.mutable_pattern()
......@@ -49,7 +48,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
->AsInput());
patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
pattern(input_nodes, times);
pattern(input_nodes, times, priorbox_type, is_reshape);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
......@@ -119,8 +118,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
bool box_normalized =
boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
// auto variance =
// boost::get<std::vector<float>>(box_coder_op->Op()->GetAttr("variance"));
int background_label =
boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
float score_threshold =
......@@ -138,7 +136,6 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
}
// int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
framework::OpDesc concat1_desc;
concat1_desc.SetType("concat");
concat1_desc.SetInput("X", concat1_input_names);
......@@ -213,31 +210,24 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
gpd(graph, handler);
}
template class SimplifyAnakinDetectionPatternPass<1>;
template class SimplifyAnakinDetectionPatternPass<2>;
template class SimplifyAnakinDetectionPatternPass<3>;
template class SimplifyAnakinDetectionPatternPass<4>;
template class SimplifyAnakinDetectionPatternPass<5>;
template class SimplifyAnakinDetectionPatternPass<6>;
void SimplifyAnakinDetectionPatternPass::ApplyImpl(ir::Graph *graph) const {
const int pattern_nums = 6;
const std::string pattern_name = "simplify_anakin_detection_pattern_pass";
FusePassBase::Init(pattern_name, graph);
std::vector<bool> options = {true, false};
for (const auto &is_density : options) {
for (const auto &is_reshape : options) {
for (int i = 1; i <= pattern_nums; i++) {
RunSimplifyAnakinDetection(graph, i, is_density, is_reshape);
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(simplify_anakin_detection_pattern_pass,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
typedef paddle::framework::ir::SimplifyAnakinDetectionPatternPass
priorbox_pattern;
REGISTER_PASS(simplify_anakin_priorbox_detection_out_pass, priorbox_pattern);
......@@ -26,7 +26,6 @@ namespace ir {
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here gives the number of repetitions of
// this structure.
template <int times>
class SimplifyAnakinDetectionPatternPass : public FusePassBase {
public:
virtual ~SimplifyAnakinDetectionPatternPass() {}
......
......@@ -25,11 +25,9 @@ namespace paddle {
namespace framework {
namespace ir {
template <int times>
void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) {
const std::string pattern_name =
"transpose_flatten" + std::to_string(times) + "_concat_fuse";
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
......@@ -122,31 +120,18 @@ void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
gpd(graph, handler);
}
template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<2>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;
void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {
const int pattern_nums = 6;
const std::string pattern_name = "transpose_flatten_concat_fuse";
FusePassBase::Init(pattern_name, graph);
for (int i = 1; i <= pattern_nums; i++) {
RunTransposeFlattenConcatFuse(graph, i);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
paddle::framework::ir::TransposeFlattenConcatFusePass);
......@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
......@@ -24,7 +26,6 @@ namespace ir {
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here gives the number of repetitions of
// this structure.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
public:
virtual ~TransposeFlattenConcatFusePass() {}
......
......@@ -56,8 +56,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
}
}
static DDim GetDims(const Scope& scope, const std::string& name,
bool get_actual_dim = false) {
static DDim GetDimsDebug(const Scope& scope, const std::string& name,
bool get_actual_dim = false) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
return DDim({-1});
......@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name,
if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>();
// if (UNLIKELY(!tensor.IsInitialized())) {
// return DDim({-1});
// }
if (UNLIKELY(!tensor.IsInitialized())) {
return DDim({-1});
}
return tensor.dims();
} else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) {
......@@ -123,7 +123,7 @@ static int GetRowSize(const Scope& scope, const std::string& name) {
return -1;
}
static LoD GetLoD(const Scope& scope, const std::string& name) {
static LoD GetLoDDebug(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
auto default_lod = LoD({{}});
......@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>();
// if (UNLIKELY(!tensor.IsInitialized())) {
// return default_lod;
// }
if (UNLIKELY(!tensor.IsInitialized())) {
return default_lod;
}
return tensor.lod();
} else {
return default_lod;
......@@ -274,8 +274,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
}
std::string dtype = GetDtype(*scope, var_name);
ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")";
ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
ss << "(" << GetLoDDebug(*scope, var_name) << ")";
}
}
if (i != input.second.size() - 1) {
......@@ -305,8 +305,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
}
std::string dtype = GetDtype(*scope, output.second[i]);
ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")";
ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
ss << "(" << GetLoDDebug(*scope, var_name) << ")";
}
}
if (i != output.second.size() - 1) {
......@@ -1017,7 +1017,7 @@ Scope* OperatorWithKernel::PrepareData(
// of search key even though the set is empty.
if (!no_buffer_ins.empty() &&
no_buffer_ins.count(var_name_item.first) > 0) {
VLOG(1) << "Skip scanning input " << var_name_item.first
VLOG(7) << "Skip scanning input " << var_name_item.first
<< " in Operator " << type_;
continue;
}
......
......@@ -34,25 +34,41 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
auto input_name = op_desc.Input("Input").front();
auto image_name = op_desc.Input("Image").front();
auto output_name = op_desc.Output("Boxes").front();
auto op_type = op_desc.Type();
auto op_name = op_type + ":" + op_desc.Output("Boxes").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front();
// only for density_prior_box
std::vector<float> fixed_sizes = {};
std::vector<float> fixed_ratios = {};
std::vector<int> densities = {};
auto fixed_sizes =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
auto fixed_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
std::vector<float> min_sizes = {};
std::vector<float> max_sizes = {};
std::vector<float> aspect_ratios = {};
bool is_clip = false;
bool is_flip = false;
if (op_type == "density_prior_box") {
fixed_sizes =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
fixed_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
} else if (op_type == "prior_box") {
min_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("min_sizes"));
max_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("max_sizes"));
aspect_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("aspect_ratios"));
is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
is_flip = boost::get<bool>(op_desc.GetAttr("flip"));
}
std::vector<float> dens;
for (auto& ele : densities) {
dens.push_back(static_cast<float>(ele));
}
// lack flip
// auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
for (auto& ele : variances) {
LOG(INFO) << ele;
}
// lack img_h, img_w
auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
......@@ -66,14 +82,14 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
std::vector<float> temp_v = {};
engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", min_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", max_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", aspect_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
engine_->AddOpAttr(op_name, "is_flip", is_flip);
engine_->AddOpAttr(op_name, "is_clip", is_clip);
engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
......@@ -88,3 +104,4 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter);
......@@ -48,7 +48,7 @@ class AnakinOpConverter {
framework::OpDesc op_desc(op, nullptr);
std::string op_type = op_desc.Type();
AnakinOpConverter *it = nullptr;
if (op_type == "depthwise_conv2d") op_type = "conv2d";
if (op_type == "reshape2") op_type = "reshape";
if (op_type == "transpose2") op_type = "transpose";
if (op_type == "flatten2") op_type = "flatten";
......
......@@ -42,6 +42,8 @@ struct SimpleOpTypeSetTeller : public Teller {
teller_set.insert("dropout");
teller_set.insert("sigmoid");
teller_set.insert("sum");
teller_set.insert("depthwise_conv2d");
teller_set.insert("prior_box");
}
bool operator()(const std::string& op_type,
......
......@@ -37,14 +37,14 @@ using framework::ir::Node;
void analysis::AnakinSubgraphPass::ApplyImpl(
framework::ir::Graph *graph) const {
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph);
auto teller = [](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false;
return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller, 6 /* min_subgraph_size */);
SubGraphFuser fuser(graph, teller, 6 /* min_subgraph_size */);
fuser();
std::vector<std::string> graph_param_names =
......@@ -56,10 +56,10 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params);
CreateAnakinOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
}
}
......@@ -69,7 +69,7 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
nodes2remove.insert(node);
}
}
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));
}
......
......@@ -192,6 +192,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
block_desc.Proto()->SerializeAsString());
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "gpu_id", Get<int>("gpu_device_id"));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
SetAttr(op_desc->Proto(), "parameters", params);
......
......@@ -52,6 +52,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
scope->EraseVars({var_name});
continue;
}
auto *var = scope->FindLocalVar(var_name);
......
......@@ -886,4 +886,5 @@ USE_ANAKIN_CONVERTER(detection_out);
USE_ANAKIN_CONVERTER(density_prior_box);
USE_ANAKIN_CONVERTER(dropout);
USE_ANAKIN_CONVERTER(sum);
USE_ANAKIN_CONVERTER(prior_box);
#endif
......@@ -70,17 +70,15 @@ void GpuPassStrategy::EnableMKLDNN() {
// The following passes works for Anakin sub-graph engine.
const std::vector<std::string> kAnakinSubgraphPasses({
"infer_clean_graph_pass", //
"simplify_anakin_detection_pattern_pass5", //
"simplify_anakin_detection_pattern_pass4", //
"simplify_anakin_detection_pattern_pass3", //
"simplify_anakin_detection_pattern_pass2", //
"anakin_fillconstant_elementwisemul_fuse", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"infer_clean_graph_pass", //
"simplify_anakin_priorbox_detection_out_pass", //
"fillconstant_elementwisemul_fuse", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"quant_conv2d_dequant_fuse_pass", //
"anakin_subgraph_pass",
});
......@@ -97,13 +95,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", //
#endif
#endif //
"transpose_flatten_concat_fuse_pass",
});
for (int i = 6; i >= 2; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
......
......@@ -23,6 +23,12 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction()
function(inference_analysis_api_int8_test target model_dir data_dir filename)
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100)
endfunction()
function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
download_model(${install_dir} ${model_name})
inference_analysis_test(${target} SRCS ${filename}
......@@ -138,6 +144,28 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
# int8 image classification tests
if(WITH_MKLDNN)
set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8")
if (NOT EXISTS ${INT8_DATA_DIR})
inference_download_and_uncompress(${INT8_DATA_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "imagenet_val_100.tar.gz")
endif()
#resnet50 int8
set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR})
inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "resnet50_int8_model.tar.gz" )
endif()
inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
#mobilenet int8
set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet")
if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR})
inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "mobilenetv1_int8_model.tar.gz" )
endif()
inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
endif()
# bert, max_len=20, embedding_dim=128
set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
......
......@@ -53,19 +53,6 @@ void Split(const std::string &line, char sep, std::vector<T> *v) {
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_int32(iterations, 0, "Number of iterations");
namespace paddle {
namespace inference {
namespace analysis {
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model);
cfg->SetProgFile("__model__");
cfg->DisableGpu();
cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames(false);
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN();
}
template <typename T>
class TensorReader {
public:
TensorReader(std::ifstream &file, size_t beginning_offset,
std::vector<int> shape, std::string name)
: file_(file), position(beginning_offset), shape_(shape), name_(name) {
numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
std::multiplies&lt;size_t&gt;());
}
PaddleTensor NextBatch() {
PaddleTensor tensor;
tensor.name = name_;
tensor.shape = shape_;
tensor.dtype = GetPaddleDType<T>();
tensor.data.Resize(numel * sizeof(T));
file_.seekg(position);
file_.read(static_cast<char *>(tensor.data.data()), numel * sizeof(T));
position = file_.tellg();
if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
if (file_.fail())
throw std::runtime_error(name_ + ": failed reading file.");
return tensor;
}
protected:
std::ifstream &file_;
size_t position;
std::vector<int> shape_;
std::string name_;
size_t numel;
};
std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
const std::vector<std::vector<PaddleTensor>> &test_data, int num_images) {
int test_data_batch_size = test_data[0][0].shape[0];
CHECK_LE(static_cast<size_t>(num_images),
test_data.size() * test_data_batch_size);
PaddleTensor images;
images.name = "input";
images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels;
labels.name = "labels";
labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images);
for (int i = 0; i < num_images; i++) {
auto batch = i / test_data_batch_size;
auto element_in_batch = i % test_data_batch_size;
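// image i lives in test batch i / batch_size, at offset i % batch_size within it.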
std::copy_n(static_cast<float *>(test_data[batch][0].data.data()) +
element_in_batch * 3 * 224 * 224,
3 * 224 * 224,
static_cast<float *>(images.data.data()) + i * 3 * 224 * 224);
std::copy_n(static_cast<int64_t *>(test_data[batch][1].data.data()) +
element_in_batch,
1, static_cast<int64_t *>(labels.data.data()) + i);
}
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(2);
(*warmup_data)[0] = std::move(images);
(*warmup_data)[1] = std::move(labels);
return warmup_data;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
int32_t batch_size = FLAGS_batch_size) {
std::ifstream file(FLAGS_infer_data, std::ios::binary);
if (!file) {
FAIL() << "Couldn't open file: " << FLAGS_infer_data;
}
int64_t total_images{0};
file.read(reinterpret_cast<char *>(&total_images), sizeof(total_images));
LOG(INFO) << "Total images in file: " << total_images;
std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
std::vector<int> label_batch_shape{batch_size, 1};
auto labels_offset_in_file =
static_cast<size_t>(file.tellg()) +
sizeof(float) * total_images *
std::accumulate(image_batch_shape.begin() + 1,
image_batch_shape.end(), 1, std::multiplies<int>());
TensorReader<float> image_reader(file, 0, image_batch_shape, "input");
TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label");
auto iterations = total_images / batch_size;
if (FLAGS_iterations > 0 && FLAGS_iterations < iterations)
iterations = FLAGS_iterations;
for (auto i = 0; i < iterations; i++) {
auto images = image_reader.NextBatch();
auto labels = label_reader.NextBatch();
inputs->emplace_back(
std::vector<PaddleTensor>{std::move(images), std::move(labels)});
}
}
TEST(Analyzer_int8_resnet50, quantization) {
AnalysisConfig cfg;
SetConfig(&cfg);
AnalysisConfig q_cfg;
SetConfig(&q_cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all, 100);
std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
GetWarmupData(input_slots_all, 100);
q_cfg.EnableMkldnnQuantizer();
q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
CompareQuantizedAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
reinterpret_cast<const PaddlePredictor::Config *>(&q_cfg),
input_slots_all);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import os
import numpy as np
import time
import sys
import random
import functools
import contextlib
from PIL import Image, ImageEnhance
import math
from paddle.dataset.common import download
random.seed(0)
np.random.seed(0)
DATA_DIM = 224
SIZE_FLOAT32 = 4
SIZE_INT64 = 8
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center:
w_start = (width - size) // 2
h_start = (height - size) // 2
else:
w_start = np.random.randint(0, width - size + 1)
h_start = np.random.randint(0, height - size + 1)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def process_image(img_path, mode, color_jitter, rotate):
img = Image.open(img_path)
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
return img
def download_unzip():
int8_download = 'int8/download'
target_name = 'data'
cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
int8_download)
target_folder = os.path.join(cache_folder, target_name)
data_urls = []
data_md5s = []
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa'
)
data_md5s.append('60f6525b0e1d127f345641d75d41f0a8')
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
)
data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5')
file_names = []
for i in range(0, len(data_urls)):
download(data_urls[i], cache_folder, data_md5s[i])
file_names.append(data_urls[i].split('/')[-1])
zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz')
if not os.path.exists(zip_path):
cat_command = 'cat'
for file_name in file_names:
cat_command += ' ' + os.path.join(cache_folder, file_name)
cat_command += ' > ' + zip_path
os.system(cat_command)
print('Data is downloaded at {0}\n'.format(zip_path))
if not os.path.exists(target_folder):
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, zip_path)
os.system(cmd)
print('Data is unzipped at {0}\n'.format(target_folder))
data_dir = os.path.join(target_folder, 'ILSVRC2012')
print('ILSVRC2012 full val set at {0}\n'.format(data_dir))
return data_dir
def reader():
data_dir = download_unzip()
file_list = os.path.join(data_dir, 'val_list.txt')
output_file = os.path.join(data_dir, 'int8_full_val.bin')
with open(file_list) as flist:
lines = [line.strip() for line in flist]
num_images = len(lines)
if not os.path.exists(output_file):
print(
'Preprocessing to binary file...<num_images><all images><all labels>...\n'
)
with open(output_file, "w+b") as of:
#save num_images(int64_t) to file
of.seek(0)
num = np.array(int(num_images)).astype('int64')
of.write(num.tobytes())
for idx, line in enumerate(lines):
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
if not os.path.exists(img_path):
continue
#save image(float32) to file
img = process_image(
img_path, 'val', color_jitter=False, rotate=False)
np_img = np.array(img)
of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
* idx)
of.write(np_img.astype('float32').tobytes())
#save label(int64_t) to file
label_int = int(label)
np_label = np.array(label_int)
of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
* num_images + idx * SIZE_INT64)
of.write(np_label.astype('int64').tobytes())
print('The preprocessed binary file path: {}\n'.format(output_file))
if __name__ == '__main__':
reader()
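For reference, a minimal standalone C++ sketch (file name and layout assumed from the script above) that sanity-checks the binary it produces: an int64 image count, followed by all float32 images, followed by all int64 labels. This is an illustrative check, not part of the test suite.
#include <cstdint>
#include <fstream>
#include <iostream>

int main() {
  // Bytes per float32 CHW image: SIZE_FLOAT32 * 3 * DATA_DIM * DATA_DIM.
  const int64_t kImageBytes = 4LL * 3 * 224 * 224;
  std::ifstream file("int8_full_val.bin", std::ios::binary);  // path assumed
  if (!file) return 1;
  int64_t num_images = 0;
  file.read(reinterpret_cast<char *>(&num_images), sizeof(num_images));
  // The first label sits right after the int64 header and all image data.
  file.seekg(sizeof(int64_t) + num_images * kImageBytes);
  int64_t first_label = -1;
  file.read(reinterpret_cast<char *>(&first_label), sizeof(first_label));
  std::cout << num_images << " images, first label: " << first_label << "\n";
  return 0;
}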
......@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true,
DEFINE_bool(record_benchmark, false,
"Record benchmark after profiling the model");
DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
DECLARE_bool(profile);
......@@ -58,6 +59,19 @@ DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace inference {
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
const auto *analysis_config =
reinterpret_cast<const AnalysisConfig *>(config);
......@@ -392,6 +406,32 @@ void TestPrediction(const PaddlePredictor::Config *config,
}
}
void CompareTopAccuracy(const std::vector<PaddleTensor> &output_slots1,
const std::vector<PaddleTensor> &output_slots2) {
// first output: avg_cost
if (output_slots1.size() == 0 || output_slots2.size() == 0)
throw std::invalid_argument(
"CompareTopAccuracy: output_slots vector is empty.");
PADDLE_ENFORCE(output_slots1.size() >= 2UL);
PADDLE_ENFORCE(output_slots2.size() >= 2UL);
// second output: acc_top1
if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0)
throw std::invalid_argument(
"CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 ||
output_slots2[1].dtype != paddle::PaddleDType::FLOAT32)
throw std::invalid_argument(
"CompareTopAccuracy: top1 accuracy output is of a wrong type.");
float *top1_quantized = static_cast<float *>(output_slots1[1].data.data());
float *top1_reference = static_cast<float *>(output_slots2[1].data.data());
LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized;
LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference;
LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy;
CHECK_LE(std::abs(*top1_quantized - *top1_reference),
FLAGS_quantized_accuracy);
}
void CompareDeterministic(
const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) {
......@@ -421,6 +461,17 @@ void CompareNativeAndAnalysis(
CompareResult(analysis_outputs, native_outputs);
}
void CompareQuantizedAndAnalysis(
const PaddlePredictor::Config *config,
const PaddlePredictor::Config *qconfig,
const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(config, true);
std::vector<PaddleTensor> analysis_outputs, quantized_outputs;
TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
TestOneThreadPrediction(qconfig, inputs, &quantized_outputs, true);
CompareTopAccuracy(quantized_outputs, analysis_outputs);
}
void CompareNativeAndAnalysis(
PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
const std::vector<std::vector<PaddleTensor>> &inputs) {
......
......@@ -23,6 +23,16 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
// Whether to use CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
// reason we set it to false by default is that this mode may use scaled
// atomic integer reduction that may cause a numerical overflow for certain
// input data range.
DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
"Whether to enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
"batch_norm, default is False.");
namespace paddle {
namespace operators {
......@@ -76,7 +86,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#if CUDNN_VERSION_MIN(7, 0, 0)
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif
......@@ -302,7 +316,11 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#if CUDNN_VERSION_MIN(7, 0, 0)
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif
......
......@@ -386,7 +386,7 @@ void BenchKernelSoftmax() {
RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs);
BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs, 1);
}
}
}
......
......@@ -34,6 +34,7 @@ const char* to_string(KernelType kt) {
ONE_CASE(kVAddRelu);
ONE_CASE(kVSub);
ONE_CASE(kVScal);
ONE_CASE(kStrideScal);
ONE_CASE(kVAddBias);
ONE_CASE(kVRelu);
ONE_CASE(kVBroadcast);
......@@ -55,6 +56,7 @@ const char* to_string(KernelType kt) {
ONE_CASE(kMatMul);
ONE_CASE(kHMax);
ONE_CASE(kHSum);
ONE_CASE(kStrideASum);
ONE_CASE(kSoftmax);
ONE_CASE(kEmbSeqPool);
ONE_CASE(kSgd);
......
......@@ -38,6 +38,8 @@ typedef enum {
kNCHW16CMulNC,
kSeqPool,
kSoftmax,
kStrideASum,
kStrideScal,
kVAdd,
kVAddBias,
kVAddRelu,
......@@ -74,6 +76,14 @@ struct XYZNTuple {
template <typename T>
struct AXYNTuple : public XYZNTuple<T> {};
// a, x, y, n, stride
template <typename T>
struct AXYNSTuple {
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, const T*, T*, int, int);
};
// x, y, n
template <typename T>
struct XYNTuple {
......@@ -86,6 +96,14 @@ struct XYNTuple {
template <typename T>
struct XRNTuple : public XYNTuple<T> {};
// x, returned value, n, stride
template <typename T>
struct XRNSTuple {
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, T*, int, int);
};
#define DECLARE_KERNELTUPLE(kernel_tuple, type) \
template <typename T> \
struct type##Tuple : public kernel_tuple<T> { \
......@@ -101,6 +119,8 @@ DECLARE_KERNELTUPLE(XYZNTuple, VSub);
DECLARE_KERNELTUPLE(AXYNTuple, VScal);
DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
DECLARE_KERNELTUPLE(AXYNSTuple, StrideScal);
DECLARE_KERNELTUPLE(XYNTuple, VRelu);
DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
DECLARE_KERNELTUPLE(XYNTuple, VSquare);
......@@ -112,6 +132,8 @@ DECLARE_KERNELTUPLE(XYNTuple, VCopy);
DECLARE_KERNELTUPLE(XRNTuple, HMax);
DECLARE_KERNELTUPLE(XRNTuple, HSum);
DECLARE_KERNELTUPLE(XRNSTuple, StrideASum);
typedef struct {
void* gates; // gates: x_ch, x_ih, x_fh, x_oh
const void* ct_1;
......@@ -285,7 +307,7 @@ struct SoftmaxTuple {
static constexpr KernelType kernel_type = kSoftmax;
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, T*, int, int);
typedef void (*func_type)(const T*, T*, int, int, int);
};
// nChw16c = nChw16c .* NC
......
......@@ -50,10 +50,15 @@ void VTanh(const T* x, T* y, int n) {
compute_addbias(&b, y, y, n);
}
void Softmax(const T* x, T* y, int n, int bs) {
// remain is the product of dimension shapes after the axis dimension
void Softmax(const T* x, T* y, int n, int bs, int remain) {
auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n);
auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n);
auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
auto compute_strideasum =
KernelFuncs<StrideASumTuple<T>, CPUPlace>::Cache().At(n);
auto compute_stridescal =
KernelFuncs<StrideScalTuple<T>, CPUPlace>::Cache().At(n);
auto compute_vaddbias =
KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
auto compute_vexp = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
......@@ -64,9 +69,17 @@ void Softmax(const T* x, T* y, int n, int bs) {
scalar = static_cast<T>(0) - scalar;
compute_vaddbias(&scalar, x, y, n); // x - max
compute_vexp(y, y, n);
compute_hsum(y, &scalar, n);
scalar = static_cast<T>(1) / scalar;
compute_vscal(&scalar, y, y, n);
if (remain == 1) {
compute_hsum(y, &scalar, n);
scalar = static_cast<T>(1) / scalar;
compute_vscal(&scalar, y, y, n);
} else {
for (int j = 0; j < remain; ++j) {
compute_strideasum(&y[j], &scalar, n, remain);
scalar = static_cast<T>(1) / scalar;
compute_stridescal(&scalar, &y[j], &y[j], n, remain);
}
}
x += n;
y += n;
}
......
......@@ -26,7 +26,7 @@ using T = float;
void VSigmoid(const T* x, T* y, int n);
void VTanh(const T* x, T* y, int n);
void Softmax(const T* x, T* y, int n, int bs);
void Softmax(const T* x, T* y, int n, int bs, int remain);
void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);
......
......@@ -7,6 +7,7 @@ USE_JITKERNEL_MORE(kMatMul, mkl)
USE_JITKERNEL_MORE(kVMul, mkl)
USE_JITKERNEL_MORE(kVAdd, mkl)
USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kStrideScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVCopy, mkl)
......
......@@ -78,6 +78,26 @@ void VScal<double>(const double* a, const double* x, double* y, int n) {
}
}
template <>
void StrideScal<float>(const float* a, const float* x, float* y, int n,
int stride) {
if (x == y) {
platform::dynload::cblas_sscal(n / stride, *a, y, stride);
} else {
refer::StrideScal<float>(a, x, y, n, stride);
}
}
template <>
void StrideScal<double>(const double* a, const double* x, double* y, int n,
int stride) {
if (x == y) {
platform::dynload::cblas_dscal(n / stride, *a, y, stride);
} else {
refer::StrideScal<double>(a, x, y, n, stride);
}
}
template <>
void VExp<float>(const float* x, float* y, int n) {
platform::dynload::vsExp(n, x, y);
......@@ -128,6 +148,16 @@ void ASum<double>(const double* x, double* res, int n) {
res[0] = platform::dynload::cblas_dasum(n, x, 1);
}
template <>
void StrideASum<float>(const float* x, float* res, int n, int stride) {
res[0] = platform::dynload::cblas_sasum(n / stride, x, stride);
}
template <>
void StrideASum<double>(const double* x, double* res, int n, int stride) {
res[0] = platform::dynload::cblas_dasum(n / stride, x, stride);
}
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <>
bool VMulKernel<float>::CanBeUsed(const int& d) const {
......@@ -144,6 +174,11 @@ bool VScalKernel<float>::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx512f) && d > 512;
}
template <>
bool StrideScalKernel<float>::CanBeUsed(const int& d) const {
return true;
}
template <>
bool VExpKernel<float>::CanBeUsed(const int& d) const {
return d > 7;
......@@ -235,6 +270,7 @@ bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
AWALYS_USE_ME_WITH_DOUBLE(VMul);
AWALYS_USE_ME_WITH_DOUBLE(VAdd);
AWALYS_USE_ME_WITH_DOUBLE(VScal);
AWALYS_USE_ME_WITH_DOUBLE(StrideScal);
AWALYS_USE_ME_WITH_DOUBLE(VExp);
AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
AWALYS_USE_ME_WITH_DOUBLE(VTanh);
......@@ -259,6 +295,7 @@ REGISTER_MKL_KERNEL(MatMul);
REGISTER_MKL_KERNEL(VMul);
REGISTER_MKL_KERNEL(VAdd);
REGISTER_MKL_KERNEL(VScal);
REGISTER_MKL_KERNEL(StrideScal);
REGISTER_MKL_KERNEL(VExp);
REGISTER_MKL_KERNEL(VSquare);
REGISTER_MKL_KERNEL(VCopy);
......
......@@ -129,7 +129,14 @@ template <typename T>
void ASum(const T* x, T* res, int n);
template <typename T>
void Softmax(const T* x, T* y, int n, int bs) {
void StrideASum(const T* x, T* res, int n, int stride);
template <typename T>
void StrideScal(const T* a, const T* x, T* y, int n, int stride);
// remain is the product of dimension shapes after the axis dimension
template <typename T>
void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
std::vector<T> entities(bs);
for (int i = 0; i < bs; ++i) {
entities[i] = x[i * n];
......@@ -143,9 +150,17 @@ void Softmax(const T* x, T* y, int n, int bs) {
VExp(y, y, n * bs);
for (int i = 0; i < bs; ++i) {
T sum;
ASum(&y[i * n], &sum, n);
sum = static_cast<T>(1) / sum;
VScal(&sum, &y[i * n], &y[i * n], n);
if (remain == 1) {
ASum(&y[i * n], &sum, n);
sum = static_cast<T>(1) / sum;
VScal(&sum, &y[i * n], &y[i * n], n);
} else {
for (int j = 0; j < remain; ++j) {
StrideASum(&y[i * n + j], &sum, n, remain);
sum = static_cast<T>(1) / sum;
StrideScal(&sum, &y[i * n + j], &y[i * n + j], n, remain);
}
}
}
}
......@@ -193,6 +208,7 @@ DECLARE_MKL_KERNEL(VAdd);
// AXYN
DECLARE_MKL_KERNEL(VScal);
DECLARE_MKL_KERNEL(StrideScal);
// XYN
DECLARE_MKL_KERNEL(VExp);
......
......@@ -12,6 +12,7 @@ USE_JITKERNEL_REFER(kVAdd)
USE_JITKERNEL_REFER(kVAddRelu)
USE_JITKERNEL_REFER(kVSub)
USE_JITKERNEL_REFER(kVScal)
USE_JITKERNEL_REFER(kStrideScal)
USE_JITKERNEL_REFER(kVAddBias)
USE_JITKERNEL_REFER(kVCopy)
USE_JITKERNEL_REFER(kVRelu)
......@@ -32,6 +33,7 @@ USE_JITKERNEL_REFER(kMatMul)
USE_JITKERNEL_REFER(kVSquare)
USE_JITKERNEL_REFER(kHSum)
USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kStrideASum)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd)
......
......@@ -27,6 +27,7 @@ REGISTER_REFER_KERNEL(VAddRelu);
REGISTER_REFER_KERNEL(VSub);
REGISTER_REFER_KERNEL(VScal);
REGISTER_REFER_KERNEL(StrideScal);
REGISTER_REFER_KERNEL(VAddBias);
REGISTER_REFER_KERNEL(VRelu);
......@@ -51,6 +52,7 @@ REGISTER_REFER_KERNEL(SeqPool);
REGISTER_REFER_KERNEL(MatMul);
REGISTER_REFER_KERNEL(HMax);
REGISTER_REFER_KERNEL(HSum);
REGISTER_REFER_KERNEL(StrideASum);
REGISTER_REFER_KERNEL(Softmax);
REGISTER_REFER_KERNEL(EmbSeqPool);
REGISTER_REFER_KERNEL(Sgd);
......
......@@ -411,19 +411,47 @@ void HSum(const T* x, T* res, int n) {
}
}
template <typename T>
void StrideASum(const T* x, T* res, int n, int stride) {
res[0] = std::abs(x[0]);
for (int i = stride; i < n; i += stride) {
res[0] += std::abs(x[i]);
}
}
template <typename T>
void StrideScal(const T* a, const T* x, T* y, int n, int stride) {
for (int i = 0; i < n; ++i) {
if (i % stride == 0) {
y[i] = x[i] * a[0];
} else {
y[i] = x[i];
}
}
}
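To make the stride semantics above concrete, a small self-contained sketch (local copies of the two refer kernels): only positions 0, stride, 2*stride, ... take part in the absolute sum or the scaling, while the remaining elements are passed through.
#include <cassert>
#include <cmath>

template <typename T>
void StrideASum(const T* x, T* res, int n, int stride) {
  res[0] = std::abs(x[0]);
  for (int i = stride; i < n; i += stride) res[0] += std::abs(x[i]);
}

template <typename T>
void StrideScal(const T* a, const T* x, T* y, int n, int stride) {
  for (int i = 0; i < n; ++i) y[i] = (i % stride == 0) ? x[i] * a[0] : x[i];
}

int main() {
  const float x[6] = {-1.f, 2.f, -3.f, 4.f, -5.f, 6.f};
  float sum = 0.f;
  StrideASum(x, &sum, 6, 2);   // |-1| + |-3| + |-5|
  assert(sum == 9.f);
  const float a = 2.f;
  float y[6];
  StrideScal(&a, x, y, 6, 2);  // scales x[0], x[2], x[4]; copies the rest
  assert(y[0] == -2.f && y[1] == 2.f && y[2] == -6.f);
  return 0;
}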
// y = e^(x - max(x))
// y = y / sum(y)
// remain is the product of dimension shapes after the axis dimension
template <typename T>
void Softmax(const T* x, T* y, int n, int bs = 1) {
void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
for (int i = 0; i < bs; ++i) {
T scalar;
HMax(x, &scalar, n);
scalar = static_cast<T>(0) - scalar;
VAddBias(&scalar, x, y, n); // x - max
VExp(y, y, n);
HSum(y, &scalar, n);
scalar = static_cast<T>(1) / scalar;
VScal(&scalar, y, y, n);
if (remain == 1) {
HSum(y, &scalar, n);
scalar = static_cast<T>(1) / scalar;
VScal(&scalar, y, y, n);
} else {
for (int j = 0; j < remain; j++) {
StrideASum(&y[j], &scalar, n, remain);
scalar = static_cast<T>(1) / scalar;
StrideScal(&scalar, &y[j], &y[j], n, remain);
}
}
x += n;
y += n;
}
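Written out, the remain > 1 branch above normalizes each of the remain interleaved columns independently, with the max still taken over the whole length-n row (the absolute-value sum coincides with a plain sum here, since every exponential is positive):
$$y_{i \cdot r + j} = \frac{e^{x_{i \cdot r + j} - \max_k x_k}}{\sum_{m=0}^{n/r - 1} e^{x_{m \cdot r + j} - \max_k x_k}}, \qquad 0 \le j < r, \; r = \text{remain}$$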
......@@ -507,6 +535,9 @@ DECLARE_REFER_KERNEL(VSub);
DECLARE_REFER_KERNEL(VScal);
DECLARE_REFER_KERNEL(VAddBias);
// const T* a, const T* x, T* y, int n, int stride
DECLARE_REFER_KERNEL(StrideScal);
// const T* x, T* y, int n
DECLARE_REFER_KERNEL(VRelu);
DECLARE_REFER_KERNEL(VIdentity);
......@@ -528,6 +559,8 @@ DECLARE_REFER_KERNEL(GRUHtPart2);
DECLARE_REFER_KERNEL(HMax);
DECLARE_REFER_KERNEL(HSum);
DECLARE_REFER_KERNEL(StrideASum);
// others
DECLARE_REFER_KERNEL(CRFDecoding);
DECLARE_REFER_KERNEL(LayerNorm);
......
......@@ -723,39 +723,122 @@ void TestKernelSoftmax() {
VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
for (int bs : {1, 2, 10}) {
for (int n : TestSizes()) {
for (int m : {1, 2, 3}) { // remain
if (m > n || n % m != 0) {
continue;
}
auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(bs * n), y(bs * n);
RandomVec<T>(bs * n, x.data());
const T* x_data = x.data();
T* y_data = y.data();
std::vector<T> xinp(x.size()); // inplace test
std::copy(x.begin(), x.end(), xinp.begin());
ref(x_data, y_data, n, bs, m);
T* xinp_data = xinp.data();
ref(xinp_data, xinp_data, n, bs, m);
ExpectEQ<T>(xinp_data, y_data, n * bs);
auto verifier = [](const typename KernelTuple::func_type tgt,
const std::vector<T>& x, const std::vector<T>& yref,
int n, int bs, int m) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(yref.size(), x.size());
EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
const T* x_data = x.data();
const T* yref_data = yref.data();
std::vector<T> ytgt(n * bs);
T* ytgt_data = ytgt.data();
// test normal
tgt(x_data, ytgt_data, n, bs, m);
ExpectEQ<T>(ytgt_data, yref_data, n * bs);
// test inplace x
std::copy(x.begin(), x.end(), ytgt.begin());
tgt(ytgt_data, ytgt_data, n, bs, m);
ExpectEQ<T>(ytgt_data, yref_data, n * bs);
};
TestAllImpls<KernelTuple, PlaceType>(n, verifier, x, y, n, bs, m);
}
}
}
}
template <typename KernelTuple, typename PlaceType>
void TestKernelStrideASum() {
using T = typename KernelTuple::data_type;
VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
for (int d : TestSizes()) {
for (int m : {1, 2, 3}) { // stride
if (m > d || d % m != 0) {
continue;
}
auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(d);
RandomVec<T>(d, x.data());
T ref_res;
ref(x.data(), &ref_res, d, m);
auto verifier = [](const typename KernelTuple::func_type tgt,
const std::vector<T>& x, const T ref_res,
const int m) {
EXPECT_TRUE(tgt != nullptr);
T tgt_res;
tgt(x.data(), &tgt_res, x.size(), m);
ExpectEQ<T>(&tgt_res, &ref_res, 1);
};
TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, ref_res, m);
}
}
}
template <typename KernelTuple, typename PlaceType>
void TestKernelStrideScal() {
using T = typename KernelTuple::data_type;
VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
for (int d : TestSizes()) {
for (int m : {1, 2, 3}) { // stride
if (m > d || d % m != 0) {
continue;
}
auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(bs * n), y(bs * n);
RandomVec<T>(bs * n, x.data());
const T* x_data = x.data();
T* y_data = y.data();
std::vector<T> xinp(x.size()); // inplace test
const T a = static_cast<T>(3);
std::vector<T> x(d), yref(d);
std::vector<T> xinp(d); // inplace test
RandomVec<T>(d, x.data());
std::copy(x.begin(), x.end(), xinp.begin());
ref(x_data, y_data, n, bs);
const T* x_data = x.data();
T* yref_data = yref.data();
T* xinp_data = xinp.data();
ref(xinp_data, xinp_data, n, bs);
ExpectEQ<T>(xinp_data, y_data, n * bs);
// test refer code inplace
ref(&a, x_data, yref_data, d, m);
ref(&a, xinp_data, xinp_data, d, m);
ExpectEQ<T>(xinp_data, yref_data, d);
auto verifier = [](const typename KernelTuple::func_type tgt,
auto verifier = [](const typename KernelTuple::func_type tgt, const T a,
const std::vector<T>& x, const std::vector<T>& yref,
int n, int bs) {
const int m) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(yref.size(), x.size());
EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
const T* x_data = x.data();
const T* yref_data = yref.data();
std::vector<T> ytgt(n * bs);
const int d = yref.size();
std::vector<T> ytgt(d);
T* ytgt_data = ytgt.data();
// test normal
tgt(x_data, ytgt_data, n, bs);
ExpectEQ<T>(ytgt_data, yref_data, n * bs);
tgt(&a, x_data, ytgt_data, d, m);
ExpectEQ<T>(ytgt_data, yref_data, d);
// test inplace x
std::copy(x.begin(), x.end(), ytgt.begin());
tgt(ytgt_data, ytgt_data, n, bs);
ExpectEQ<T>(ytgt_data, yref_data, n * bs);
tgt(&a, ytgt_data, ytgt_data, d, m);
ExpectEQ<T>(ytgt_data, yref_data, d);
};
TestAllImpls<KernelTuple, PlaceType>(n, verifier, x, y, n, bs);
TestAllImpls<KernelTuple, PlaceType>(d, verifier, a, x, yref, m);
}
}
}
......@@ -912,7 +995,7 @@ TEST(JITKernel_pool, more) {
EXPECT_EQ(kers.size(), 10UL);
#else
#ifdef PADDLE_WITH_MKLML
EXPECT_EQ(kers.size(), 21UL);
EXPECT_EQ(kers.size(), 22UL);
#else
EXPECT_EQ(kers.size(), 8UL);
#endif
......@@ -921,7 +1004,7 @@ TEST(JITKernel_pool, more) {
TEST(JITKernel_pool, refer) {
const auto& kers = jit::ReferKernelPool::Instance().AllKernels();
EXPECT_EQ(kers.size(), 29UL);
EXPECT_EQ(kers.size(), 31UL);
}
// test helper
......@@ -1292,3 +1375,6 @@ TEST_CPU_KERNEL(MatMul);
TEST_CPU_KERNEL(Softmax);
TEST_CPU_KERNEL(Sgd);
TEST_CPU_KERNEL(VBroadcast);
TEST_CPU_KERNEL(StrideASum);
TEST_CPU_KERNEL(StrideScal);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/kldiv_loss_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class KLDivLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of KLDivLossOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Target"),
"Input(Target) of KLDivLossOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Loss"),
"Output(Loss) of KLDivLossOp should not be null.");
auto dim_x = ctx->GetInputDim("X");
auto dim_target = ctx->GetInputDim("Target");
PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
"Input(X) rank and Input(Target) rank should be the same.");
for (int i = 0; i < dim_x.size(); i++) {
PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i],
"Input(X) and Input(Target) should be in the same shape.");
}
auto reduction = ctx->Attrs().Get<std::string>("reduction");
PADDLE_ENFORCE(
"mean" == reduction || "sum" == reduction || "batchmean" == reduction ||
"none" == reduction,
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.");
if ("none" == reduction) {
ctx->SetOutputDim("Loss", dim_x);
} else {
ctx->SetOutputDim("Loss", {1});
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
};
class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of KL divergence loss operator. "
"This is a tensor with shape of [N, *], where N is the "
"batch size, * means any number of additional dimensions.");
AddInput("Target",
"The tensor of KL divergence loss operator. "
"This is a tensor with shape of Input(X).");
AddOutput(
"Loss",
"The output KL divergence loss tensor. if Attr(reduction) is "
"'none', this tensor should be in same shape of of Input(X), else "
"this tensor should be in shape of [1].");
AddAttr<std::string>(
"reduction",
"The reduction type to apply to the output, available types "
"are 'none' | 'batchmean' | 'mean' | 'sum', 'none' for no "
"reduction, 'batchmean' for the sum of output divided by "
"batch size, 'mean' for the average value of all output, "
"'sum' for the sum of the output.")
.SetDefault("mean");
AddComment(R"DOC(
This operator calculates the Kullback-Leibler divergence loss
between Input(X) and Input(Target).
KL divergence loss is calculated as follows:
$$l(x, y) = y * (\log(y) - x)$$
where :math:`x` is Input(X) and :math:`y` is Input(Target).
When :attr:`reduction` is :attr:`none`, the output loss has
the same shape as Input(X); the loss at each point is calculated
separately and no reduction is applied.
When :attr:`reduction` is :attr:`mean`, the output loss has
shape [1] and its value is the mean of all losses.
When :attr:`reduction` is :attr:`sum`, the output loss has
shape [1] and its value is the sum of all losses.
When :attr:`reduction` is :attr:`batchmean`, the output loss
has shape [1] and its value is the sum of all losses divided
by the batch size.
)DOC");
}
};
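A quick numeric check of the loss formula in the DOC above (illustrative values; note the formula subtracts :math:`x` from :math:`\log(y)`, i.e. Input(X) is expected in log space):
$$y = 0.5, \; x = \log 0.25: \quad l(x, y) = 0.5 \, (\log 0.5 - \log 0.25) = 0.5 \log 2 \approx 0.347$$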
class KLDivLossOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput("Target"), "Input(Target) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
"Input(Loss@GRAD) should not be null");
auto dim_x = ctx->GetInputDim("X");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
};
class KLDivLossOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("kldiv_loss_grad");
op->SetInput("X", Input("X"));
op->SetInput("Target", Input("Target"));
op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
ops::KLDivLossOpGradMaker);
REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad);
REGISTER_OP_CPU_KERNEL(
kldiv_loss, ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, float>,
ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
kldiv_loss_grad,
ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/kldiv_loss_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
kldiv_loss,
ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, float>,
ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
kldiv_loss_grad,
ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
using Array1 = Eigen::DSizes<int64_t, 1>;
template <typename T>
struct KLDivLossForward {
HOSTDEVICE KLDivLossForward() {}
HOSTDEVICE T operator()(const T& target, const T& input) const {
if (target <= 0) {
return 0;
} else {
return target * (std::log(target) - input);
}
}
};
template <typename T>
struct KLDivLossBackward {
HOSTDEVICE KLDivLossBackward() {}
HOSTDEVICE T operator()(const T& target, const T& grad) const {
if (target <= 0) {
return 0;
} else {
return static_cast<T>(-1.) * grad;
}
}
};
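KLDivLossBackward above encodes the analytic gradient of the pointwise loss: since :math:`l(x, y) = y(\log(y) - x)`,
$$\frac{\partial l}{\partial x} = -y, \qquad \text{so} \qquad \nabla_x = -y \cdot \nabla_{loss},$$
with the gradient clamped to zero where the target is non-positive, mirroring the clamp in the forward functor. The grad kernel below first forms :math:`y \cdot \nabla_{loss}` and then applies this sign flip elementwise.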
template <typename DeviceContext, typename T>
class KLDivLossKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto* input = ctx.Input<Tensor>("X");
auto* target = ctx.Input<Tensor>("Target");
auto* loss = ctx.Output<Tensor>("Loss");
auto reduction = ctx.Attr<std::string>("reduction");
const int n = input->dims()[0];
loss->mutable_data<T>(ctx.GetPlace());
auto input_t = EigenVector<T>::Flatten(*input);
auto target_t = EigenVector<T>::Flatten(*target);
auto loss_t = EigenVector<T>::Flatten(*loss);
auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
if ("none" == reduction) {
loss_t.device(place) = output;
} else if ("batchmean" == reduction) {
auto output_sum = output.sum().eval();
loss_t.device(place) = output_sum / output_sum.constant(n);
} else if ("mean" == reduction) {
loss_t.device(place) = output.mean();
} else if ("sum" == reduction) {
loss_t.device(place) = output.sum();
}
}
};
template <typename DeviceContext, typename T>
class KLDivLossGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto* target = ctx.Input<Tensor>("Target");
auto reduction = ctx.Attr<std::string>("reduction");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
const int n = input_grad->dims()[0];
const int numel = input_grad->numel();
const int expand = numel / loss_grad->numel();
input_grad->mutable_data<T>(ctx.GetPlace());
auto target_t = EigenVector<T>::Flatten(*target);
auto input_grad_t = EigenVector<T>::Flatten(*input_grad);
auto loss_grad_t = EigenVector<T>::Flatten(*loss_grad);
auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
auto grad_t = target_t * loss_grad_expand;
input_grad_t.device(place) =
target_t.binaryExpr(grad_t, KLDivLossBackward<T>());
if ("mean" == reduction) {
input_grad_t.device(place) = input_grad_t / static_cast<T>(numel);
} else if ("batchmean" == reduction) {
input_grad_t.device(place) = input_grad_t / static_cast<T>(n);
}
}
};
} // namespace operators
} // namespace paddle
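Summarizing the reduction handling implemented by the two kernels above (with :math:`N` the batch size, i.e. dims[0], and numel the total element count):
$$L_{none} = l, \quad L_{mean} = \frac{1}{numel}\sum l, \quad L_{sum} = \sum l, \quad L_{batchmean} = \frac{1}{N}\sum l$$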
......@@ -23,15 +23,16 @@ template <typename DeviceContext, typename T, bool is_test,
typename Enable = void>
class SoftmaxFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y);
void operator()(const DeviceContext& context, const int axis_dim,
const framework::Tensor* X, framework::Tensor* Y);
};
template <typename DeviceContext, typename T>
class SoftmaxGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor* y,
const framework::Tensor* y_grad, framework::Tensor* x_grad);
void operator()(const DeviceContext& context, const int axis_dim,
const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad);
};
#ifdef PADDLE_WITH_CUDA
......
......@@ -36,8 +36,8 @@ struct ValueClip {
template <typename DeviceContext, typename T, bool is_test, typename Enable>
void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) {
const DeviceContext& context, const int axis_dim,
const framework::Tensor* X, framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X);
auto softmax = EigenMatrix<T>::From(*Y);
......@@ -46,10 +46,13 @@ void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
const int num_remain = num_classes / axis_dim;
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
Eigen::DSizes<int, 2> one_axis(1, axis_dim);
auto shifted_logits = (logits -
logits.maximum(along_class)
......@@ -60,11 +63,11 @@ void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
softmax.device(*context.eigen_device()) = shifted_logits.exp();
softmax.device(*context.eigen_device()) = (softmax *
softmax.sum(along_class)
softmax.reshape(batch_axis_remain)
.sum(along_class)
.inverse()
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
.broadcast(one_axis));
}
template <class DeviceContext>
......@@ -73,8 +76,8 @@ using enable_if_CPU = typename std::enable_if<
template <typename DeviceContext>
class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
void operator()(const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) {
void operator()(const DeviceContext& context, const int axis_dim,
const framework::Tensor* X, framework::Tensor* Y) {
auto in_dims = X->dims();
const float* in_data = X->data<float>();
float* out_data = Y->data<float>();
......@@ -84,14 +87,16 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
auto compute_softmax =
jit::KernelFuncs<jit::SoftmaxTuple<float>, platform::CPUPlace>::Cache()
.At(in_dims[kClassDim]);
compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim],
in_dims[kClassDim] / axis_dim);
}
};
template <typename DeviceContext, typename T>
void SoftmaxGradFunctor<DeviceContext, T>::operator()(
const DeviceContext& context, const framework::Tensor* y,
const framework::Tensor* y_grad, framework::Tensor* x_grad) {
const DeviceContext& context, const int axis_dim,
const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad) {
auto softmax = EigenMatrix<T>::From(*y);
auto softmax_grad = EigenMatrix<T>::From(*y_grad);
auto logits_grad = EigenMatrix<T>::From(*x_grad);
......@@ -101,16 +106,19 @@ void SoftmaxGradFunctor<DeviceContext, T>::operator()(
const int batch_size = softmax.dimension(kBatchDim);
const int num_classes = softmax.dimension(kClassDim);
const int num_remain = num_classes / axis_dim;
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
Eigen::DSizes<int, 2> one_axis(1, axis_dim);
auto dot = (softmax * softmax_grad)
.reshape(batch_axis_remain)
.sum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class);
.broadcast(one_axis);
logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax;
}
......
......@@ -39,6 +39,20 @@ class SoftmaxOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SoftmaxOp should not be null.");
auto dim_x = ctx->GetInputDim("X");
auto rank_x = dim_x.size();
auto axis = ctx->Attrs().Get<int>("axis");
PADDLE_ENFORCE(axis >= -rank_x && axis < rank_x,
"Attr(axis) value should be in range [-R, R-1], "
"R is the rank of Input(X).");
auto use_cudnn = ctx->Attrs().Get<bool>("use_cudnn");
auto use_mkldnn = ctx->Attrs().Get<bool>("use_mkldnn");
if (axis != rank_x - 1 && axis != -1) {
PADDLE_ENFORCE(!use_cudnn, "CUDNN kernel only supports axis as -1.");
PADDLE_ENFORCE(!use_mkldnn, "MKLDNN kernel only supports axis as -1.");
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
......@@ -80,8 +94,12 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddInput("X",
"The input tensor of softmax, "
"whose last dimension is the input_feature_dimensions.");
"whose dimension :attr:`axis` is the input_feature_dimensions.");
AddOutput("Out", "The normalized values with the same shape as X.");
AddAttr<int>("axis",
"The dimension index of Input(x) to perform softmax,"
"default -1 for last dimension")
.SetDefault(-1);
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
......@@ -106,12 +124,13 @@ Softmax Operator.
The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input.
The input tensor will first be logically flattened to a 2-D matrix. The matrix's
second dimension(row length) is as same as the last dimension of the input
The dimension :attr:`axis` of the input tensor will be permuted to the last.
Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
second dimension (row length) is the same as the dimension :attr:`axis` of the input
tensor, and the first dimension(column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional(K is the width of the matrix, which is also the size
of the input tensor's last dimension) vector of arbitrary real values to a
of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input.
......
......@@ -20,6 +20,30 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
static inline int CanonicalAxis(const int axis, const int rank) {
if (axis < 0) {
return axis + rank;
}
return axis;
}
static inline int SizeToAxis(const int axis, DDim dims) {
int size = 1;
for (int i = 0; i < axis; i++) {
size *= dims[i];
}
return size;
}
static inline int SizeFromAxis(const int axis, DDim dims) {
int size = 1;
for (int i = axis; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
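A standalone sketch (illustrative shapes assumed) of the axis bookkeeping above: the tensor is viewed as an [n, d] matrix with n = SizeToAxis and d = SizeFromAxis, and the softmax runs over axis_dim with remain = d / axis_dim trailing elements per step.
#include <cassert>
#include <vector>

static int SizeToAxis(int axis, const std::vector<int>& dims) {
  int size = 1;
  for (int i = 0; i < axis; ++i) size *= dims[i];
  return size;
}
static int SizeFromAxis(int axis, const std::vector<int>& dims) {
  int size = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) size *= dims[i];
  return size;
}

int main() {
  std::vector<int> dims = {2, 3, 4, 5};    // a 4-D tensor
  int axis = 2;                            // softmax along the third dimension
  assert(SizeToAxis(axis, dims) == 6);     // n = 2 * 3
  assert(SizeFromAxis(axis, dims) == 20);  // d = 4 * 5
  // axis_dim = 4, so remain = d / axis_dim = 5 interleaved columns.
  return 0;
}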
template <typename DeviceContext, typename T>
class SoftmaxKernel : public framework::OpKernel<T> {
......@@ -27,20 +51,27 @@ class SoftmaxKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<Tensor>("X");
auto* Out = context.Output<Tensor>("Out");
const int rank = X->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis_dim = X->dims()[axis];
// allocate memory on device.
Out->mutable_data<T>(context.GetPlace());
int rank = X->dims().size();
Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
const int n = SizeToAxis(axis, X->dims());
const int d = SizeFromAxis(axis, X->dims());
Tensor X_2d, Out_2d;
X_2d.ShareDataWith(*X).Resize({n, d});
Out_2d.ShareDataWith(*Out).Resize({n, d});
#ifdef PADDLE_ON_INFERENCE
math::SoftmaxFunctor<DeviceContext, T, true>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
context.template device_context<DeviceContext>(), axis_dim, &X_2d,
&Out_2d);
#else
math::SoftmaxFunctor<DeviceContext, T, false>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
context.template device_context<DeviceContext>(), axis_dim, &X_2d,
&Out_2d);
#endif
}
};
......@@ -52,18 +83,23 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
auto* Out = context.Input<Tensor>("Out");
auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
const int rank = dX->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis_dim = dX->dims()[axis];
// allocate memory on device.
dX->mutable_data<T>(context.GetPlace());
int rank = Out->dims().size();
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1);
Tensor dX_2d = framework::ReshapeToMatrix(*dX, rank - 1);
const int n = SizeToAxis(axis, dX->dims());
const int d = SizeFromAxis(axis, dX->dims());
Tensor dX_2d, Out_2d, dOut_2d;
dX_2d.ShareDataWith(*dX).Resize({n, d});
Out_2d.ShareDataWith(*Out).Resize({n, d});
dOut_2d.ShareDataWith(*dOut).Resize({n, d});
math::SoftmaxGradFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), &Out_2d, &dOut_2d,
&dX_2d);
context.template device_context<DeviceContext>(), axis_dim, &Out_2d,
&dOut_2d, &dX_2d);
}
};
......
......@@ -40,10 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
softmax->mutable_data<T>(context.GetPlace());
loss->mutable_data<T>(context.GetPlace());
int axis_dim = logits->dims()[logits->dims().size() - 1];
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
dev_ctx, logits, softmax);
dev_ctx, axis_dim, logits, softmax);
math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
context.Attr<int>("ignore_index"));
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/temporal_shift_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class TemporalShiftOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of TemporalShiftOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of TemporalShiftOp should not be null.");
auto dim_x = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(dim_x.size(), 4,
"Input(X) rank should be 4 in shape of [N*T, C, H, W].");
int seg_num = ctx->Attrs().Get<int>("seg_num");
float shift_ratio = ctx->Attrs().Get<float>("shift_ratio");
PADDLE_ENFORCE_GT(seg_num, 0, "Attr(seg_num) should be greater than 0.");
PADDLE_ENFORCE(shift_ratio > 0 && shift_ratio < .5,
"Attr(shift_ratio) should be greater than 0 and less "
"than 0.5.");
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
dim_x[0] % seg_num, 0,
"Input(X) dims[0] should be divided exactly by Attr(seg_num).");
}
ctx->SetOutputDim("Out", dim_x);
ctx->ShareLoD("X", "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
};
class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of temporal shift operator. "
"This is a 4-D tensor with shape of [N*T, C, H, W]. "
"While N is the batch size, T is the temporal segment "
"number, C is the channel number, H is the height of "
"features and W is the width of features.");
AddOutput("Out",
"The output tensor of temporal shift operator. "
"This is a 4-D tensor in the same shape with Input(X).");
AddAttr<int>("seg_num",
"The temporal segment number, this should be a positive "
"integer.");
AddAttr<float>(
"shift_ratio",
"The shift ratio of the channels, the first :attr:`shift_ratio` part "
"of channels will be shifted by -1 along the temporal dimension, "
"and the second :attr:`shift_ratio` part of channels will be shifted "
"by 1 along the temporal dimension. Default 0.25.")
.SetDefault(0.25);
AddComment(R"DOC(
This operator calculates the temporal shifting features for Input(X).
Input(X) should be in shape of [N*T, C, H, W], where N is the batch
size, T is the temporal segment number specified by :attr:`seg_num`,
C is the channel number, and H and W are the height and width of features.
Temporal Shifting is calculated as follows:
Step 1: Reshape Input(X) to [N, T, C, H, W].
Step 2: Pad the reshaped result with zeros along the 2nd (T)
dimension, with a padding width of 1 on each side; the padded
result will be in shape of [N, T+2, C, H, W].
Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding
result as follows:
$$
slice1 = x[:, :T, :C/4, :, :]
$$
$$
slice2 = x[:, 2:T+2, C/4:C/2, :, :]
$$
$$
slice3 = x[:, 1:T+1, C/2:, :, :]
$$
Step 4: Concatenate the three slices along the 3rd (C) dimension and
reshape the result to [N*T, C, H, W].
For details of temporal shifting, please refer to paper:
`Temporal Shift Module <http://arxiv.org/abs/1811.08383>`_ .
)DOC");
}
};
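A small self-contained sketch (channel count and ratio assumed) of the channel-to-source-frame mapping the DOC above describes; out-of-range source frames read as zero in the kernels.
#include <cassert>

// Which temporal frame channel ic of frame it reads from.
int SourceFrame(int it, int ic, int c, float shift_ratio) {
  const int c1 = static_cast<int>(c * shift_ratio);
  const int c2 = static_cast<int>(c * 2 * shift_ratio);
  if (ic < c1) return it - 1;  // first part: shifted by -1 in time
  if (ic < c2) return it + 1;  // second part: shifted by +1 in time
  return it;                   // remaining channels: unshifted
}

int main() {
  // c = 8 channels, shift_ratio = 0.25 -> c1 = 2, c2 = 4
  assert(SourceFrame(3, 0, 8, 0.25f) == 2);  // first quarter reads frame t-1
  assert(SourceFrame(3, 2, 8, 0.25f) == 4);  // second quarter reads frame t+1
  assert(SourceFrame(3, 5, 8, 0.25f) == 3);  // rest stays at frame t
  return 0;
}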
class TemporalShiftOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto dim_x = ctx->GetInputDim("X");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(temporal_shift, ops::TemporalShiftOp,
ops::TemporalShiftOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(temporal_shift_grad, ops::TemporalShiftOpGrad);
REGISTER_OP_CPU_KERNEL(temporal_shift, ops::TemporalShiftKernel<float>,
ops::TemporalShiftKernel<double>);
REGISTER_OP_CPU_KERNEL(temporal_shift_grad, ops::TemporalShiftGradKernel<float>,
ops::TemporalShiftGradKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/temporal_shift_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using framework::Tensor;
template <typename T>
__global__ void KeTemporalShiftFw(const T* input, T* output, const int ntchw,
const int tchw, const int chw, const int hw,
const int w, const int t, const int c,
const float shift_ratio) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int src_it = 0;
for (; tid < ntchw; tid += stride) {
int in = tid / tchw;
int it = (tid % tchw) / chw;
int ic = (tid % chw) / hw;
int ih = (tid % hw) / w;
int iw = tid % w;
const int c1 = static_cast<int>(c * shift_ratio);
const int c2 = static_cast<int>(c * 2 * shift_ratio);
if (ic < c1) {
src_it = it - 1;
} else if (ic < c2) {
src_it = it + 1;
} else {
src_it = it;
}
if (src_it < 0 || src_it >= t) {
output[tid] = 0;
} else {
int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
output[tid] = input[src_idx];
}
}
}
template <typename T>
__global__ void KeTemporalShiftBw(const T* output_grad, T* input_grad,
const int ntchw, const int tchw,
const int chw, const int hw, const int w,
const int t, const int c,
const float shift_ratio) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int src_it = 0;
for (; tid < ntchw; tid += stride) {
int in = tid / tchw;
int it = (tid % tchw) / chw;
int ic = (tid % chw) / hw;
int ih = (tid % hw) / w;
int iw = tid % w;
const int c1 = static_cast<int>(c * shift_ratio);
const int c2 = static_cast<int>(c * 2 * shift_ratio);
if (ic < c1) {
src_it = it - 1;
} else if (ic < c2) {
src_it = it + 1;
} else {
src_it = it;
}
if (src_it >= 0 && src_it < t) {
int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
input_grad[src_idx] = output_grad[tid];
}
}
}
template <typename T>
class TemporalShiftOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
int t = ctx.Attr<int>("seg_num");
float shift_ratio = ctx.Attr<float>("shift_ratio");
const int nt = input->dims()[0];
const int c = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int hw = h * w;
const int chw = c * hw;
const int tchw = t * chw;
const int ntchw = nt * chw;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
int pixelNum = nt * chw;
int grid_dim = (pixelNum + 512 - 1) / 512;
grid_dim = grid_dim > 8 ? 8 : grid_dim;
KeTemporalShiftFw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_data, output_data, ntchw, tchw, chw, hw, w, t, c, shift_ratio);
}
};
template <typename T>
class TemporalShiftGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
int t = ctx.Attr<int>("seg_num");
float shift_ratio = ctx.Attr<float>("shift_ratio");
const int nt = output_grad->dims()[0];
const int c = output_grad->dims()[1];
const int h = output_grad->dims()[2];
const int w = output_grad->dims()[3];
const int hw = h * w;
const int chw = c * hw;
const int tchw = t * chw;
const int ntchw = nt * chw;
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data =
input_grad->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T>()(
ctx.template device_context<platform::CUDADeviceContext>(), input_grad,
static_cast<T>(0));
int pixelNum = nt * chw;
int grid_dim = (pixelNum + 512 - 1) / 512;
grid_dim = grid_dim > 8 ? 8 : grid_dim;
KeTemporalShiftBw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
output_grad_data, input_grad_data, ntchw, tchw, chw, hw, w, t, c,
shift_ratio);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(temporal_shift, ops::TemporalShiftOpCUDAKernel<float>,
ops::TemporalShiftOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(temporal_shift_grad,
ops::TemporalShiftGradOpCUDAKernel<float>,
ops::TemporalShiftGradOpCUDAKernel<double>);
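Note on the launch configuration above: the grid is capped at 8 blocks of 512 threads, and the grid-stride loop inside each kernel covers the remaining elements. A minimal Python sketch (illustrative only, not part of the source) showing that the capped grid still visits every index exactly once:

def covered_indices(n, block=512, max_grid=8):
    grid = min(max_grid, (n + block - 1) // block)
    stride = grid * block
    # each thread tid processes tid, tid + stride, tid + 2*stride, ...
    return sorted(i for tid in range(stride) for i in range(tid, n, stride))

assert covered_indices(10000) == list(range(10000))  # every element, exactly once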
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
static HOSTDEVICE inline int GetEntryIndex(int in, int it, int ic, int ih,
int iw, const int tchw,
const int chw, const int hw,
const int w) {
return in * tchw + it * chw + ic * hw + ih * w + iw;
}
template <typename T>
class TemporalShiftKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
int t = ctx.Attr<int>("seg_num");
float shift_ratio = ctx.Attr<float>("shift_ratio");
const int nt = input->dims()[0];
const int c = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int c1 = static_cast<int>(c * shift_ratio);
const int c2 = static_cast<int>(c * 2 * shift_ratio);
const int hw = h * w;
const int chw = c * hw;
const int tchw = t * chw;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
int src_it = 0;
for (int i = 0; i < output->numel(); i++) {
int in = i / tchw;
int it = (i % tchw) / chw;
int ic = (i % chw) / hw;
int ih = (i % hw) / w;
int iw = i % w;
if (ic < c1) {
src_it = it - 1;
} else if (ic < c2) {
src_it = it + 1;
} else {
src_it = it;
}
if (src_it < 0 || src_it >= t) {
output_data[i] = 0;
} else {
int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
output_data[i] = input_data[src_idx];
}
}
}
};
template <typename T>
class TemporalShiftGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
int t = ctx.Attr<int>("seg_num");
float shift_ratio = ctx.Attr<float>("shift_ratio");
const int nt = output_grad->dims()[0];
const int c = output_grad->dims()[1];
const int h = output_grad->dims()[2];
const int w = output_grad->dims()[3];
const int c1 = static_cast<int>(c * shift_ratio);
const int c2 = static_cast<int>(c * 2 * shift_ratio);
const int hw = h * w;
const int chw = c * hw;
const int tchw = t * chw;
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data =
input_grad->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
int src_it = 0;
for (int i = 0; i < output_grad->numel(); i++) {
int in = i / tchw;
int it = (i % tchw) / chw;
int ic = (i % chw) / hw;
int ih = (i % hw) / w;
int iw = i % w;
if (ic < c1) {
src_it = it - 1;
} else if (ic < c2) {
src_it = it + 1;
} else {
src_it = it;
}
if (src_it >= 0 && src_it < t) {
int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
input_grad_data[src_idx] = output_grad_data[i];
}
}
}
};
} // namespace operators
} // namespace paddle
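For reference, a minimal NumPy sketch (an illustration of the same semantics as the kernels above, not part of the source) of the forward temporal shift: the first c1 channels read from the previous frame, the next c2 - c1 channels read from the next frame, the rest are copied through, and out-of-range frames are zero-padded.

import numpy as np

def temporal_shift(x, seg_num, shift_ratio):
    nt, c, h, w = x.shape                 # input is (N*T, C, H, W)
    n, t = nt // seg_num, seg_num
    x = x.reshape(n, t, c, h, w)
    c1 = int(c * shift_ratio)
    c2 = int(c * 2 * shift_ratio)
    out = np.zeros_like(x)
    out[:, 1:, :c1] = x[:, :-1, :c1]      # src_it = it - 1; first frame zeroed
    out[:, :-1, c1:c2] = x[:, 1:, c1:c2]  # src_it = it + 1; last frame zeroed
    out[:, :, c2:] = x[:, :, c2:]         # remaining channels unshifted
    return out.reshape(nt, c, h, w)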
......@@ -52,6 +52,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
std::string engine_key_;
std::string engine_serialized_data_;
bool calibration_mode_;
int device_id_;
public:
TensorRTEngineOp(const std::string &type,
......@@ -62,6 +63,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
input_names_ = Inputs("Xs");
max_batch_size_ = Attr<int>("max_batch_size");
workspace_size_ = Attr<int>("workspace_size");
device_id_ = Attr<int>("gpu_id");
enable_int8_ = Attr<bool>("enable_int8");
calibration_data_ = Attr<std::string>("calibration_data");
engine_key_ = Attr<std::string>("engine_key");
......@@ -79,6 +81,17 @@ class TensorRTEngineOp : public framework::OperatorBase {
if (enable_int8_ && calibration_data_.size()) {
calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
}
if (!calibration_mode_ && !engine_serialized_data_.empty()) {
trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
device_id_));
      PADDLE_ENFORCE(engine_serialized_data_.size(),
                     "TRT serialized data should not be empty here, "
                     "there must be an error when generating serialized data "
                     "in the TRT subgraph detect pass.");
trt_engine_->Deserialize(engine_serialized_data_);
}
}
protected:
......@@ -225,12 +238,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
if (!trt_engine_) {
trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
boost::get<platform::CUDAPlace>(dev_place).device));
if (!engine_serialized_data_.empty()) {
trt_engine_->Deserialize(engine_serialized_data_);
} else {
PrepareTRTEngine(scope, trt_engine_.get());
}
device_id_));
PrepareTRTEngine(scope, trt_engine_.get());
}
return trt_engine_.get();
}
......
......@@ -108,6 +108,8 @@ TEST(TensorRTEngineOp, manual) {
std::vector<std::string>({"z0"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id);
LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
......@@ -204,6 +206,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
std::vector<std::string>({"z3"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id);
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
......
......@@ -67,9 +67,11 @@ class CudnnCTCKernel : public framework::OpKernel<T> {
softmax_logits.mutable_data<T>(logits->dims(), ctx.GetPlace());
softmax_logits.set_lod(logits_lod);
int rank = logits->dims().size();
int axis_dim = logits->dims()[rank - 1];
Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1);
Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1);
math::SoftmaxFunctor<DeviceContext, T, false>()(dev_ctx, &in_2d, &out_2d);
math::SoftmaxFunctor<DeviceContext, T, false>()(dev_ctx, axis_dim, &in_2d,
&out_2d);
// ctc needs sequences data stored in transposed padding format
// logits and grad using padding data of layout 'TNC'
......
......@@ -44,9 +44,12 @@ add_subdirectory(dynload)
cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
set(dgc_deps "")
IF(WITH_GPU)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
set(dgc_deps dgc)
if(NOT WIN32)
set(dgc_deps dgc)
endif()
ELSE()
set(dgc_deps)
ENDIF()
......
......@@ -34,7 +34,7 @@ from . import io
from . import evaluator
from . import initializer
from . import layers
from . import imperative
from . import dygraph
from . import contrib
from . import nets
from . import optimizer
......@@ -71,7 +71,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'initializer',
'layers',
'contrib',
'imperative',
'dygraph',
'transpiler',
'nets',
'optimizer',
......@@ -180,7 +180,7 @@ def __bootstrap__():
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'sync_nccl_allreduce', 'limit_of_tmp_allocation',
'times_excess_than_required_tmp_allocation',
'enable_inplace_whitelist'
'enable_inplace_whitelist', 'cudnn_batchnorm_spatial_persistent'
]
core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
......
......@@ -30,6 +30,8 @@ from . import slim
from .slim import *
from . import utils
from .utils import *
from . import extend_optimizer
from .extend_optimizer import *
__all__ = []
__all__ += decoder.__all__
......@@ -40,3 +42,4 @@ __all__ += int8_inference.__all__
__all__ += reader.__all__
__all__ += slim.__all__
__all__ += utils.__all__
__all__ += extend_optimizer.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import extend_optimizer_with_weight_decay
from .extend_optimizer_with_weight_decay import *
__all__ = []
__all__ += extend_optimizer_with_weight_decay.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid
from paddle.fluid import framework as framework
__all__ = ["extend_with_decoupled_weight_decay"]
class DecoupledWeightDecay(object):
def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
if not isinstance(coeff, float) and \
not isinstance(coeff, framework.Variable):
raise TypeError("coeff should be float or Variable.")
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
super(DecoupledWeightDecay, self).__init__(**kwargs)
def _scale_parameters(self, params_and_grads):
"""
Adds weight decay ops.
scaled_parameter = parameter * coeff
Args:
params_and_grads: A list of (parameters, gradients) pairs,
                the parameters that need to be decayed.
Raises:
Exception: The type of coeff and parameter is not consistent.
"""
if isinstance(self._coeff, float) and self._coeff == 0.0:
            return []  # nothing to decay; callers iterate over the result
scaled_params = []
for param, grad in params_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
continue
if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name):
continue
            if isinstance(self._coeff, float):
                assert param.dtype == paddle.fluid.core.VarDesc.VarType.FP32, \
                    "the type of coeff(float) and parameter(%s) is not consistent." % param.dtype
            else:
                assert self._coeff.dtype == param.dtype, \
                    "the type of coeff(%s) and parameter(%s) is not consistent." % (self._coeff.dtype, param.dtype)
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
assert param.name not in self._params_name
scaled_params.append((param, grad, param * self._coeff))
self._params_name.add(param.name)
return scaled_params
def backward(self, **kargs):
return super(DecoupledWeightDecay, self).backward(**kargs)
def apply_optimize(self, **kargs):
return super(DecoupledWeightDecay, self).apply_optimize(**kargs)
def minimize(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None):
params_grads = self.backward(
loss=loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
scaled_params = self._scale_parameters(params_grads)
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
paddle.fluid.layers.assign(input=updated_param, output=param)
optimize_ops = self.apply_optimize(
loss=loss,
params_grads=params_grads,
startup_program=startup_program)
return optimize_ops, params_grads
def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
def extend_with_decoupled_weight_decay(base_optimizer):
"""
    extend_with_decoupled_weight_decay is a decorator function that returns an
    optimizer class with decoupled weight decay. The returned optimizer applies
    weight decay to the optimized parameters using the parameter values from
    before optimization, i.e.: new_parameter = optimized_parameter - parameter * coeff.
    For the details of decoupled weight decay, please refer to
    `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
Args:
base_optimizer (Optimizer): The base_optimizer should be a derived class of Optimizer.
Returns:
        OptimizerWithDecoupledWeightDecay: the optimizer with decoupled weight decay.
Examples:
.. code-block:: python
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
fluid.optimizer.Adam)
optimizer = AdamW(learning_rate=0.1,
weight_decay=0.01)
optimizer.minimize(cost)
"""
if not issubclass(base_optimizer, paddle.fluid.optimizer.Optimizer):
raise TypeError(
"The input(base_optimizer) should be a derived class of Optimizer.")
class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecay,
base_optimizer):
"""
OptimizerWithDecoupledWeightDecay is used to update the optimized parameters
with the parameters before optimization. For more information, please refer:
https://arxiv.org/pdf/1711.05101.pdf.
Args:
weight_decay (float|Variable): The weight decay coefficient, it can be
float or Variable.
            apply_decay_param_fun (function|None): If it is not None,
                only variables for which apply_decay_param_fun(variable)==True
                will be decayed. It only works when we want to specify variables.
                Default: None.
"""
def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
super(OptimizerWithDecoupledWeightDecay, self).__init__(
weight_decay, apply_decay_param_fun, **kwargs)
return OptimizerWithDecoupledWeightDecay
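A hedged numeric sketch of the update rule described above, new_parameter = optimized_parameter - parameter * coeff (plain SGD stands in for the base optimizer; all values are made up):

param, grad, lr, coeff = 1.0, 0.5, 0.1, 0.01

optimized = param - lr * grad          # base optimizer step
new_param = optimized - param * coeff  # decay uses the pre-update parameter value
print(new_param)                       # 0.94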
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Example:
    >>> from paddle.fluid.contrib.model_stat import summary
    >>> main_program = ...
    >>> summary(main_program)
+-----+------------+----------------+----------------+---------+------------+
| No. | TYPE | INPUT | OUTPUT | PARAMs | FLOPs |
+-----+------------+----------------+----------------+---------+------------+
| 0 | conv2d | (3, 200, 200) | (64, 100, 100) | 9408 | 188160000 |
| 1 | batch_norm | (64, 100, 100) | (64, 100, 100) | 256 | 640000 |
| 2 | relu | (64, 100, 100) | (64, 100, 100) | 0 | 640000 |
| 3 | pool2d | (64, 100, 100) | (64, 50, 50) | 0 | 1440000 |
...
| 176 | conv2d | (512, 7, 7) | (512, 7, 7) | 2359296 | 231211008 |
| 177 | relu | (512, 7, 7) | (512, 7, 7) | 0 | 25088 |
| 178 | conv2d | (512, 7, 7) | (2048, 7, 7) | 1048576 | 102760448 |
| 179 | relu | (2048, 7, 7) | (2048, 7, 7) | 0 | 100352 |
| 180 | pool2d | (2048, 7, 7) | (2048, 1, 1) | 0 | 100352 |
+-----+------------+----------------+----------------+---------+------------+
Total PARAMs: 48017344(0.0480G)
Total FLOPs: 11692747751(11.69G)
'''
from collections import OrderedDict
from prettytable import PrettyTable
def summary(main_prog):
'''
    Summarize the model's PARAMs and FLOPs.
    It supports common operators like conv, fc, pool, relu, sigmoid, bn, etc.
    Args:
        main_prog: main program
    Returns:
        None; the summary is printed to the terminal.
'''
collected_ops_list = []
for one_b in main_prog.blocks:
block_vars = one_b.vars
for one_op in one_b.ops:
op_info = OrderedDict()
spf_res = _summary_model(block_vars, one_op)
if spf_res is None:
continue
# TODO: get the operator name
op_info['type'] = one_op.type
op_info['input_shape'] = spf_res[0][1:]
op_info['out_shape'] = spf_res[1][1:]
op_info['PARAMs'] = spf_res[2]
op_info['FLOPs'] = spf_res[3]
collected_ops_list.append(op_info)
summary_table, total = _format_summary(collected_ops_list)
_print_summary(summary_table, total)
def _summary_model(block_vars, one_op):
'''
Compute operator's params and flops.
Args:
block_vars: all vars of one block
one_op: one operator to count
Returns:
in_data_shape: one operator's input data shape
out_data_shape: one operator's output data shape
params: one operator's PARAMs
        flops: one operator's FLOPs
'''
if one_op.type in ['conv2d', 'depthwise_conv2d']:
k_arg_shape = block_vars[one_op.input("Filter")[0]].shape
in_data_shape = block_vars[one_op.input("Input")[0]].shape
out_data_shape = block_vars[one_op.output("Output")[0]].shape
c_out, c_in, k_h, k_w = k_arg_shape
_, c_out_, h_out, w_out = out_data_shape
assert c_out == c_out_, 'shape error!'
k_groups = one_op.attr("groups")
kernel_ops = k_h * k_w * (c_in / k_groups)
bias_ops = 0 if one_op.input("Bias") == [] else 1
params = c_out * (kernel_ops + bias_ops)
flops = h_out * w_out * c_out * (kernel_ops + bias_ops)
        # based on the NVIDIA paper: count both mul and add
flops = 2 * flops
elif one_op.type == 'pool2d':
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
_, c_out, h_out, w_out = out_data_shape
k_size = one_op.attr("ksize")
params = 0
flops = h_out * w_out * c_out * (k_size[0] * k_size[1])
elif one_op.type == 'mul':
k_arg_shape = block_vars[one_op.input("Y")[0]].shape
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
        # TODO: fc is lowered to mul ops; add an attr to the mul op telling us
        # whether it belongs to an 'fc' layer. The name check below is not the
        # best way.
if 'fc' not in one_op.output("Out")[0]:
return None
k_in, k_out = k_arg_shape
# bias in sum op
params = k_in * k_out + 1
flops = k_in * k_out
elif one_op.type in ['sigmoid', 'tanh', 'relu', 'leaky_relu', 'prelu']:
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
params = 0
if one_op.type == 'prelu':
params = 1
flops = 1
for one_dim in in_data_shape:
flops *= one_dim
elif one_op.type == 'batch_norm':
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Y")[0]].shape
_, c_in, h_out, w_out = in_data_shape
# gamma, beta
params = c_in * 2
# compute mean and std
flops = h_out * w_out * c_in * 2
else:
return None
return in_data_shape, out_data_shape, params, flops
def _format_summary(collected_ops_list):
'''
Format summary report.
Args:
collected_ops_list: the collected operator with summary
Returns:
summary_table: summary report format
total: sum param and flops
'''
summary_table = PrettyTable(
["No.", "TYPE", "INPUT", "OUTPUT", "PARAMs", "FLOPs"])
summary_table.align = 'r'
total = {}
total_params = []
total_flops = []
for i, one_op in enumerate(collected_ops_list):
# notice the order
table_row = [
i,
one_op['type'],
one_op['input_shape'],
one_op['out_shape'],
int(one_op['PARAMs']),
int(one_op['FLOPs']),
]
summary_table.add_row(table_row)
total_params.append(int(one_op['PARAMs']))
total_flops.append(int(one_op['FLOPs']))
total['params'] = total_params
total['flops'] = total_flops
return summary_table, total
def _print_summary(summary_table, total):
'''
Print all the summary on terminal.
Args:
summary_table: summary report format
total: sum param and flops
'''
    params = total['params']
    flops = total['flops']
    print(summary_table)
    print('Total PARAMs: {}({:.4f}M)'.format(
        sum(params), sum(params) / (10**6)))
print('Total FLOPs: {}({:.2f}G)'.format(sum(flops), sum(flops) / 10**9))
print(
"Notice: \n now supported ops include [Conv, DepthwiseConv, FC(mul), BatchNorm, Pool, Activation(sigmoid, tanh, relu, leaky_relu, prelu)]"
)
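As a worked example of the conv2d branch in _summary_model above (a hypothetical 7x7 convolution without bias, groups=1; the numbers reproduce row 0 of the sample table in the module docstring):

c_out, c_in, k_h, k_w, groups = 64, 3, 7, 7, 1
h_out, w_out = 100, 100

kernel_ops = k_h * k_w * (c_in // groups)       # 147 multiplies per output value
params = c_out * kernel_ops                     # 9408
flops = 2 * h_out * w_out * c_out * kernel_ops  # 188160000, counting mul and add
print(params, flops)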
......@@ -19,7 +19,7 @@ from .... import Program
from .... import program_guard
from .... import regularizer
__all__ = ['FSPDistiller', 'L2Distiller']
__all__ = ['FSPDistiller', 'L2Distiller', 'SoftLabelDistiller']
class L2Distiller(object):
......@@ -186,3 +186,91 @@ class FSPDistillerPass(object):
def _fsp_matrix(self, fea_map_0, fea_map_1):
return layers.fsp_matrix(fea_map_0, fea_map_1)
class SoftLabelDistiller(object):
"""
    Combine two layers from the student net and the teacher net with a
    softmax_with_cross_entropy loss, and add that loss into the total loss
    used for distillation training.
"""
def __init__(self,
student_feature_map=None,
teacher_feature_map=None,
student_temperature=1.0,
teacher_temperature=1.0,
distillation_loss_weight=1):
"""
Args:
student_feature_map(str): The name of feature map from student network.
teacher_feature_map(str): The name of feature map from teacher network.
                                 Its shape should be the same as that of the student feature map.
student_temperature(float): Temperature used to divide student_feature_map before softmax_with_cross_entropy. default: 1.0
teacher_temperature(float): Temperature used to divide teacher_feature_map before softmax_with_cross_entropy. default: 1.0
            distillation_loss_weight(float): The weight of the soft-label distillation loss.
"""
self.student_feature_map = student_feature_map
self.teacher_feature_map = teacher_feature_map
self.distillation_loss_weight = distillation_loss_weight
self.student_temperature = student_temperature
self.teacher_temperature = teacher_temperature
def distiller_loss(self, graph):
"""
Modify graph inplace to add softmax_with_cross_entropy loss.
Args:
graph(GraphWrapper): The graph to be modified.
Returns:
GraphWrapper: The modified graph.
"""
distiller_pass = SoftLabelDistillerPass(
self.student_feature_map, self.teacher_feature_map,
self.student_temperature, self.teacher_temperature,
self.distillation_loss_weight)
dis_graph = distiller_pass.apply(graph)
return dis_graph
class SoftLabelDistillerPass(object):
def __init__(self,
student_feature_map,
teacher_feature_map,
student_temperature,
teacher_temperature,
distillation_loss_weight=1):
"""
Args:
student_feature_map(str): The name of feature map from student network.
teacher_feature_map(str): The name of feature map from teacher network.
                                 Its shape should be the same as that of the student feature map.
student_temperature(float): Temperature used to divide student_feature_map before softmax_with_cross_entropy.
teacher_temperature(float): Temperature used to divide teacher_feature_map before softmax_with_cross_entropy.
            distillation_loss_weight(float): The weight of the soft-label distillation loss.
"""
self.student_feature_map = student_feature_map
self.teacher_feature_map = teacher_feature_map
self.student_temperature = student_temperature
self.teacher_temperature = teacher_temperature
self.distillation_loss_weight = distillation_loss_weight
def apply(self, graph):
ret_graph = graph
with program_guard(ret_graph.program):
student_feature_map = ret_graph.var(self.student_feature_map)._var
teacher_feature_map = ret_graph.var(self.teacher_feature_map)._var
s_fea = student_feature_map / self.student_temperature
            t_fea = teacher_feature_map / self.teacher_temperature
t_fea.stop_gradient = True
ce_loss = layers.softmax_with_cross_entropy(
s_fea, t_fea, soft_label=True)
distillation_loss = ce_loss * self.distillation_loss_weight
student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
loss = distillation_loss + student_loss
ret_graph.out_nodes[
'soft_label_loss_' + self.student_feature_map + "_" +
self.teacher_feature_map] = distillation_loss.name
ret_graph.out_nodes['loss'] = loss.name
return ret_graph
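A minimal NumPy sketch of the temperature-scaled soft-label loss this pass builds (illustrative and slightly idealized: here the teacher's temperature-scaled outputs are normalized into a distribution before being used as the soft label):

import numpy as np

def soft_label_loss(student_logits, teacher_logits,
                    student_temperature=1.0, teacher_temperature=1.0,
                    distillation_loss_weight=0.001):
    def softmax(x):
        e = np.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)
    s = softmax(student_logits / student_temperature)
    t = softmax(teacher_logits / teacher_temperature)  # soft label; no gradient
    ce = -(t * np.log(s)).sum(axis=-1).mean()          # soft-label cross entropy
    return distillation_loss_weight * ce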
......@@ -204,6 +204,10 @@ class GraphWrapper(object):
"""
super(GraphWrapper, self).__init__()
self.program = Program() if program is None else program
self.persistables = {}
for var in self.program.list_vars():
if var.persistable:
self.persistables[var.name] = var
self.compiled_graph = None
self.in_nodes = OrderedDict(in_nodes)
self.out_nodes = OrderedDict(out_nodes)
......@@ -467,7 +471,12 @@ class GraphWrapper(object):
path(str): The path to save the persistables.
exe(framework.Executor): The executor used to save the persistables.
"""
io.save_persistables(exe.exe, path, main_program=self.program)
# update persistables from program
for var in self.program.list_vars():
if var.persistable and var.name not in self.persistables:
self.persistables[var.name] = var
io.save_vars(exe.exe, path, vars=self.persistables.values())
def load_persistables(self, path, exe):
"""
......@@ -481,7 +490,7 @@ class GraphWrapper(object):
return os.path.exists(os.path.join(path, var.name))
io.load_vars(
exe.exe, path, main_program=self.program, predicate=if_exist)
exe.exe, path, vars=self.persistables.values(), predicate=if_exist)
def update_param_shape(self, scope):
"""
......
......@@ -20,7 +20,7 @@ from .... import io
from .... import core
from ....compiler import CompiledProgram
from ....compiler import BuildStrategy
from ....framework import IrGraph
from ....framework import IrGraph, Variable, Program
from ..core.strategy import Strategy
from .quantization_pass import *
......@@ -88,41 +88,76 @@ class QuantizationStrategy(Strategy):
self.save_out_nodes = save_out_nodes
self.save_in_nodes = save_in_nodes
def on_compression_begin(self, context):
"""
        Restore the graph when the compression task is initialized from a checkpoint.
"""
        # The task was initialized from a checkpoint and has already missed the start epoch.
if context.epoch_id != 0 and context.epoch_id > self.start_epoch:
_logger.info("Restore quantization task from checkpoint")
self._modify_graph_for_quantization(context)
_logger.info("Finish restoring quantization task from checkpoint")
def _modify_graph_for_quantization(self, context):
"""
        Insert fake_quantize_op and fake_dequantize_op before training and testing.
"""
train_ir_graph = IrGraph(
core.Graph(context.optimize_graph.program.clone().desc),
for_test=False)
test_ir_graph = IrGraph(
core.Graph(context.eval_graph.program.clone().desc), for_test=True)
transform_pass = QuantizationTransformPass(
scope=context.scope,
place=context.place,
weight_bits=self.weight_bits,
activation_bits=self.activation_bits,
activation_quantize_type=self.activation_quantize_type,
weight_quantize_type=self.weight_quantize_type)
transform_pass.apply(train_ir_graph)
transform_pass.apply(test_ir_graph)
# Put persistables created by transform_pass into context.optimize_graph.persistables
# for saving checkpoint.
program_persistables = set()
for var in context.optimize_graph.program.list_vars():
if var.persistable:
program_persistables.add(var.name)
program = Program()
for var_node in train_ir_graph.all_persistable_nodes():
if var_node.name() not in program_persistables:
var_desc = var_node.var()
var = program.global_block().create_var(
name=var_node.name(),
shape=var_desc.shape(),
dtype=var_desc.dtype(),
type=var_desc.type(),
lod_level=var_desc.lod_level())
context.optimize_graph.persistables[var.name] = var
build_strategy = BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
# for quantization training
context.optimize_graph.compiled_graph = CompiledProgram(
train_ir_graph.graph).with_data_parallel(
loss_name=context.optimize_graph.out_nodes['loss'],
build_strategy=build_strategy)
        # for evaluation; a program compiled from an IR graph must use data parallel
context.eval_graph.compiled_graph = CompiledProgram(
test_ir_graph.graph).with_data_parallel(
build_strategy=build_strategy)
# for saving inference model after training
context.put('quantization_test_ir_graph_backup', test_ir_graph)
def on_epoch_begin(self, context):
"""
        Insert fake_quantize_op and fake_dequantize_op before training and testing.
"""
super(QuantizationStrategy, self).on_compression_begin(context)
super(QuantizationStrategy, self).on_epoch_begin(context)
if self.start_epoch == context.epoch_id:
_logger.info('QuantizationStrategy::on_epoch_begin')
train_ir_graph = IrGraph(
core.Graph(context.optimize_graph.program.desc), for_test=False)
test_ir_graph = IrGraph(
core.Graph(context.eval_graph.program.desc), for_test=True)
transform_pass = QuantizationTransformPass(
scope=context.scope,
place=context.place,
weight_bits=self.weight_bits,
activation_bits=self.activation_bits,
activation_quantize_type=self.activation_quantize_type,
weight_quantize_type=self.weight_quantize_type)
transform_pass.apply(train_ir_graph)
transform_pass.apply(test_ir_graph)
build_strategy = BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
# for quantization training
context.optimize_graph.compiled_graph = CompiledProgram(
train_ir_graph.graph).with_data_parallel(
loss_name=context.optimize_graph.out_nodes['loss'],
build_strategy=build_strategy)
        # for evaluation; a program compiled from an IR graph must use data parallel
context.eval_graph.compiled_graph = CompiledProgram(
test_ir_graph.graph).with_data_parallel(
build_strategy=build_strategy)
# for saving inference model after training
context.put('quantization_test_ir_graph_backup', test_ir_graph)
self._modify_graph_for_quantization(context)
_logger.info('Finish QuantizationStrategy::on_epoch_begin')
def on_epoch_end(self, context):
......
......@@ -33,10 +33,17 @@ distillers:
teacher_feature_map: 'teacher.tmp_2'
student_feature_map: 'student.tmp_2'
distillation_loss_weight: 1
soft_label_distiller:
class: 'SoftLabelDistiller'
student_temperature: 1.0
teacher_temperature: 1.0
teacher_feature_map: 'teacher.tmp_1'
student_feature_map: 'student.tmp_1'
distillation_loss_weight: 0.001
strategies:
distillation_strategy:
class: 'DistillationStrategy'
distillers: ['fsp_distiller', 'l2_distiller']
distillers: ['fsp_distiller', 'l2_distiller', 'soft_label_distiller']
start_epoch: 0
end_epoch: 1
compressor:
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from functools import partial
import numpy as np
import paddle
import paddle.fluid as fluid
import contextlib
def get_places():
places = [fluid.CPUPlace()]
if fluid.core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
return places
@contextlib.contextmanager
def prog_scope_guard(main_prog, startup_prog):
scope = fluid.core.Scope()
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
with fluid.program_guard(main_prog, startup_prog):
yield
def bow_net(data,
label,
dict_dim,
is_sparse=False,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
fluid/PaddleNLP/text_classification/nets.py
"""
emb = fluid.layers.embedding(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost
class TestWeightDecay(unittest.TestCase):
def setUp(self):
self.word_dict = paddle.dataset.imdb.word_dict()
reader = paddle.batch(
paddle.dataset.imdb.train(self.word_dict), batch_size=2)()
self.train_data = [next(reader) for _ in range(5)]
self.learning_rate = .5
def run_program(self, place, feed_list):
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe.run(fluid.default_startup_program())
main_prog = fluid.default_main_program()
param_list = [var.name for var in main_prog.block(0).all_parameters()]
param_sum = []
for data in self.train_data:
out = exe.run(main_prog,
feed=feeder.feed(data),
fetch_list=param_list)
p_sum = 0
for v in out:
p_sum += np.sum(np.abs(v))
param_sum.append(p_sum)
return param_sum
def check_weight_decay(self, place, model):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
startup_prog.random_seed = 1
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
avg_cost = model(data, label, len(self.word_dict))
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
fluid.optimizer.Adam)
optimizer = AdamW(
learning_rate=self.learning_rate,
weight_decay=self.learning_rate)
optimizer.minimize(avg_cost)
param_sum = self.run_program(place, [data, label])
return param_sum
def check_weight_decay2(self, place, model):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
startup_prog.random_seed = 1
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
avg_cost = model(data, label, len(self.word_dict))
param_list = [(var, var * self.learning_rate)
for var in main_prog.block(0).all_parameters()]
optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate)
optimizer.minimize(avg_cost)
for params in param_list:
updated_p = fluid.layers.elementwise_sub(
x=params[0], y=params[1])
fluid.layers.assign(input=updated_p, output=params[0])
param_sum = self.run_program(place, [data, label])
return param_sum
def test_weight_decay(self):
for place in get_places():
model = partial(bow_net, is_sparse=False)
param_sum1 = self.check_weight_decay(place, model)
param_sum2 = self.check_weight_decay2(place, model)
for i in range(len(param_sum1)):
assert np.isclose(a=param_sum1[i], b=param_sum2[i], rtol=5e-5)
if __name__ == '__main__':
unittest.main()
......@@ -32,6 +32,9 @@ from .profiler import *
from . import checkpoint
from .checkpoint import *
from . import learning_rate_scheduler
from .learning_rate_scheduler import *
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
......@@ -39,3 +42,4 @@ __all__ += nn.__all__
__all__ += tracer.__all__
__all__ += profiler.__all__
__all__ += checkpoint.__all__
__all__ += learning_rate_scheduler.__all__
......@@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable']
def enabled():
return framework._in_imperative_mode()
return framework._in_dygraph_mode()
@signature_safe_contextmanager
......@@ -39,14 +39,14 @@ def guard(place=None):
with framework.program_guard(train, startup):
with framework.unique_name.guard():
with framework._imperative_guard(tracer):
with framework._imperative_place_guard(place):
with framework._dygraph_guard(tracer):
with framework._dygraph_place_guard(place):
yield
def to_variable(value, block=None, name=None):
if isinstance(value, np.ndarray):
assert enabled(), "to_variable could only be called in imperative mode"
assert enabled(), "to_variable could only be called in dygraph mode"
if not block:
block = framework.default_main_program().current_block()
......
......@@ -68,7 +68,7 @@ def save_persistables(vardict, dirname, filename=None):
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
param_path = "./my_paddle_model"
fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(), dirname=param_path,
fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path,
layer=ptb_model)
"""
if isinstance(vardict, collections.OrderedDict):
......@@ -97,17 +97,17 @@ def load_persistables(vardict, dirname, filename=None):
Examples:
.. code-block:: python
my_layer = layer(fluid.imperative.Layer)
my_layer = layer(fluid.dygraph.Layer)
param_path = "./my_paddle_model"
param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.parameters(), param_path)
param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path)
param_1 = param_dict['PtbModel_0.w_1']
or:
my_layer = layer(fluid.imperative.Layer)
my_layer = layer(fluid.dygraph.Layer)
param_path = "./my_paddle_model"
filename = "model.file"
param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path,
param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path,
filename=filename)
param_1 = param_dict['PtbModel_0.w_1']
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import copy
import six
from ..framework import Parameter, _in_imperative_mode
from ..framework import Parameter, _in_dygraph_mode
from ..param_attr import ParamAttr
from .. import core
from six.moves import zip
......
......@@ -283,7 +283,7 @@ class PyLayer(core.PyLayer):
@classmethod
def __call__(cls, *inputs):
tracer = framework._imperative_tracer()
tracer = framework._dygraph_tracer()
block = framework.default_main_program().current_block()
ivar_inputs = [x._ivar for x in inputs]
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import math
from .. import unique_name
__all__ = [
'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
]
class LearningRateDecay(object):
"""
Base class of learning rate decay
"""
def __init__(self, begin=0, step=1, dtype='float32'):
self.step_num = begin
self.step_size = step
self.dtype = dtype
def __call__(self):
lr = self.step()
if isinstance(lr, float):
lr = self.create_lr_var(lr)
self.step_num += self.step_size
return lr
def create_lr_var(self, lr):
from .. import layers
lr = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(lr),
dtype=self.dtype,
persistable=True)
return lr
def step(self):
raise NotImplementedError()
class PiecewiseDecay(LearningRateDecay):
def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
super(PiecewiseDecay, self).__init__(begin, step, dtype)
self.boundaries = boundaries
self.values = values
self.vars = []
for value in values:
self.vars.append(self.create_lr_var(value))
def step(self):
for i in range(len(self.boundaries)):
if self.step_num < self.boundaries[i]:
return self.vars[i]
return self.vars[len(self.values) - 1]
class NaturalExpDecay(LearningRateDecay):
def __init__(self,
learning_rate,
decay_steps,
decay_rate,
staircase=False,
begin=0,
step=1,
dtype='float32'):
super(NaturalExpDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.decay_steps = decay_steps
self.decay_rate = decay_rate
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
decayed_lr = self.learning_rate * layers.exp(-1 * self.decay_rate *
div_res)
return decayed_lr
class ExponentialDecay(LearningRateDecay):
def __init__(self,
learning_rate,
decay_steps,
decay_rate,
staircase=False,
begin=0,
step=1,
dtype='float32'):
super(ExponentialDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.decay_steps = decay_steps
self.decay_rate = decay_rate
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
decayed_lr = self.learning_rate * (self.decay_rate**div_res)
return decayed_lr
class InverseTimeDecay(LearningRateDecay):
def __init__(self,
learning_rate,
decay_steps,
decay_rate,
staircase=False,
begin=0,
step=1,
dtype='float32'):
super(InverseTimeDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.decay_steps = decay_steps
self.decay_rate = decay_rate
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res)
return decayed_lr
class PolynomialDecay(LearningRateDecay):
def __init__(self,
learning_rate,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False,
begin=0,
step=1,
dtype='float32'):
super(PolynomialDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.decay_steps = decay_steps
self.end_learning_rate = end_learning_rate
self.power = power
self.cycle = cycle
def step(self):
from .. import layers
tmp_step_num = self.step_num
tmp_decay_steps = self.decay_steps
if self.cycle:
div_res = layers.ceil(
self.create_lr_var(tmp_step_num / float(self.decay_steps)))
if tmp_step_num == 0:
div_res = self.create_lr_var(1.0)
tmp_decay_steps = self.decay_steps * div_res
else:
tmp_step_num = self.create_lr_var(tmp_step_num
if tmp_step_num < self.decay_steps
else self.decay_steps)
decayed_lr = (self.learning_rate - self.end_learning_rate) * \
((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
return decayed_lr
class CosineDecay(LearningRateDecay):
def __init__(self,
learning_rate,
step_each_epoch,
epochs,
begin=0,
step=1,
dtype='float32'):
super(CosineDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.step_each_epoch = step_each_epoch
self.epochs = epochs
def step(self):
from .. import layers
cur_epoch = layers.floor(
self.create_lr_var(self.step_num / self.step_each_epoch))
decayed_lr = self.learning_rate * 0.5 * (
layers.cos(cur_epoch * math.pi / self.epochs) + 1)
return decayed_lr
class NoamDecay(LearningRateDecay):
def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
super(NoamDecay, self).__init__(begin, step, dtype)
self.d_model = d_model
self.warmup_steps = warmup_steps
def step(self):
from .. import layers
a = self.create_lr_var(self.step_num**-0.5)
b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b)
return lr_value
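For instance, the NoamDecay schedule above rises during warmup and then decays with the inverse square root of the step. A plain-Python check of the formula (d_model and warmup_steps are illustrative values):

d_model, warmup_steps = 512, 4000

def noam_lr(step):
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

print(noam_lr(1), noam_lr(4000), noam_lr(16000))  # peaks around step 4000, then decays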
......@@ -133,7 +133,7 @@ class Conv2D(layers.Layer):
outputs={'Out': [pre_act]},
attrs={'axis': 1})
# Currently, we don't support inplace in imperative mode
# Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(pre_act, act=self._act)
......@@ -265,7 +265,7 @@ class FC(layers.Layer):
attrs={'axis': self._num_flatten_dims})
else:
pre_activation = pre_bias
# Currently, we don't support inplace in imperative mode
# Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(pre_activation, act=self._act)
......@@ -387,7 +387,7 @@ class BatchNorm(layers.Layer):
"use_global_stats": self._use_global_stats
})
# Currently, we don't support inplace in imperative mode
# Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(batch_norm_out, self._act)
......@@ -426,7 +426,7 @@ class Embedding(layers.Layer):
dict_size = len(dataset.ids)
input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
embedding = fluid.imperative.Embedding(size=[dict_size, 16])
embedding = fluid.dygraph.Embedding(size=[dict_size, 16])
fc = embedding(input)
"""
......
......@@ -24,12 +24,12 @@ __all__ = ['Tracer']
def release_op(op):
del framework._imperative_tracer()._ops[op._trace_id]
del framework._dygraph_tracer()._ops[op._trace_id]
class Tracer(core.Tracer):
"""
Python wrapper of imperative tracer
Python wrapper of dygraph tracer
"""
def __init__(self, block):
......
......@@ -75,20 +75,20 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
_imperative_tracer_ = None
_imperative_current_expected_place_ = None
_dygraph_tracer_ = None
_dygraph_current_expected_place_ = None
def _in_imperative_mode():
return _imperative_tracer_ is not None
def _in_dygraph_mode():
return _dygraph_tracer_ is not None
def _imperative_tracer():
return _imperative_tracer_
def _dygraph_tracer():
return _dygraph_tracer_
def _current_expected_place():
return _imperative_current_expected_place_
return _dygraph_current_expected_place_
def _cpu_num():
......@@ -396,7 +396,7 @@ class Variable(object):
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
if _in_imperative_mode():
if _in_dygraph_mode():
# record vars in tracer rather than blocks
self._ivar = kwargs.get("ivar", None)
if not self._ivar:
......@@ -406,7 +406,7 @@ class Variable(object):
_current_expected_place(), stop_gradient, True
if persistable else False)
if persistable:
_imperative_tracer().trace_var(name, self)
_dygraph_tracer().trace_var(name, self)
else:
self.error_clip = error_clip
......@@ -515,8 +515,8 @@ class Variable(object):
Returns:
str: The debug string.
"""
if _in_imperative_mode():
# TODO(panyx0718): add more imperative debug info.
if _in_dygraph_mode():
# TODO(panyx0718): add more dygraph debug info.
return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype,
self.shape)
......@@ -548,42 +548,42 @@ class Variable(object):
@property
def _stop_gradient(self):
if _in_imperative_mode():
if _in_dygraph_mode():
return self._ivar.stop_gradient
else:
return self.stop_gradient
@_stop_gradient.setter
def _stop_gradient(self, s):
if _in_imperative_mode():
if _in_dygraph_mode():
self._ivar.stop_gradient = s
else:
self.stop_gradient = s
@property
def persistable(self):
if _in_imperative_mode():
if _in_dygraph_mode():
return self._ivar.persistable
else:
return self.desc.persistable()
@persistable.setter
def persistable(self, p):
if _in_imperative_mode():
if _in_dygraph_mode():
return self._ivar.persistable
else:
self.desc.set_persistable(p)
@property
def name(self):
if _in_imperative_mode():
if _in_dygraph_mode():
return self._ivar.name
else:
return cpt.to_text(self.desc.name())
@name.setter
def name(self, new_name):
if _in_imperative_mode():
if _in_dygraph_mode():
self._ivar.name = new_name
else:
self.desc.set_name(new_name)
......@@ -591,26 +591,26 @@ class Variable(object):
@property
def shape(self):
# convert to tuple, make it as same as numpy API.
if _in_imperative_mode():
if _in_dygraph_mode():
return self._ivar.shape
else:
return tuple(self.desc.shape())
@property
def dtype(self):
if _in_imperative_mode():
if _in_dygraph_mode():
return self._ivar.dtype
else:
return self.desc.dtype()
@property
def lod_level(self):
# TODO(minqiyang): Support lod_level in imperative mode
# TODO(minqiyang): Support lod_level in dygraph mode
return self.desc.lod_level()
@property
def type(self):
if _in_imperative_mode():
if _in_dygraph_mode():
return self._ivar.dtype
else:
return self.desc.type()
......@@ -789,13 +789,24 @@ class Variable(object):
if isinstance(item, tuple):
if len(item) > len(self.shape):
raise IndexError("Too many indexes")
fixedSize = True
for i in range(len(self.shape)):
if self.shape[i] == -1:
fixedSize = False
break
newitem = self._reconstructSliceinfo(item) or item
check, info = self._detectContinuesSlice(newitem)
if check:
starts = info[0]
ends = info[1]
axes = [i for i in range(len(starts))]
return self._sliceVar(axes, starts, ends)
if fixedSize:
check, info = self._detectContinuesSlice(newitem)
if check:
starts = info[0]
ends = info[1]
axes = [i for i in range(len(starts))]
return self._sliceVar(axes, starts, ends)
else:
new_var = self
for index, o in enumerate(newitem):
new_var = new_var._sliceAndConcatVar(o, index)
else:
new_var = self
for index, o in enumerate(newitem):
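The change above gates the continuous-slice fast path on a fully known shape: any -1 (inferred) dimension forces the per-axis slice-and-concat fallback. A tiny sketch of that dispatch check (illustrative only):

def has_fixed_size(shape):
    return all(d != -1 for d in shape)

print(has_fixed_size((3, 4, 5)))   # True:  continuous-slice fast path allowed
print(has_fixed_size((-1, 4, 5)))  # False: fall back to slice-and-concat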
......@@ -918,7 +929,7 @@ class Operator(object):
inputs=None,
outputs=None,
attrs=None):
if _in_imperative_mode():
if _in_dygraph_mode():
if type is None:
raise ValueError(
"`type` to initialized an Operator can not be None.")
......@@ -1037,7 +1048,7 @@ class Operator(object):
for arg in out_args:
out_arg_names.append(cpt.to_text(arg.name))
# TODO(minqiyang): could we remove variable's op in static mode?
if not _in_imperative_mode():
if not _in_dygraph_mode():
arg.op = self
self.desc.set_output(out_proto.name, out_arg_names)
......@@ -1083,7 +1094,7 @@ class Operator(object):
@property
def type(self):
if _in_imperative_mode():
if _in_dygraph_mode():
return self.iop.type
else:
return self.desc.type()
......@@ -1626,7 +1637,7 @@ class Block(object):
Returns:
Operator: the append Operator.
"""
if _in_imperative_mode():
if _in_dygraph_mode():
op = Operator(
block=self,
desc=None,
......@@ -1638,9 +1649,8 @@ class Block(object):
# record ops in tracer rather than blocks
#
# TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in imperative mode.
_imperative_tracer().trace_op(op,
kwargs.get("stop_gradient", False))
# currently, we only support stop_gradient in dygraph mode.
_dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
else:
op_desc = self.desc.append_op()
op = Operator(
......@@ -1699,7 +1709,7 @@ class Block(object):
return self.ops[start:end]
def _prepend_op(self, *args, **kwargs):
if _in_imperative_mode():
if _in_dygraph_mode():
op = Operator(
self,
None,
......@@ -1707,8 +1717,7 @@ class Block(object):
inputs=kwargs.get("inputs", None),
outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None))
_imperative_tracer().trace_op(op,
kwargs.get("stop_gradient", False))
_dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
else:
op_desc = self.desc._prepend_op()
op = Operator(
......@@ -3511,22 +3520,22 @@ def _get_var(name, program=None):
@signature_safe_contextmanager
def _imperative_guard(tracer):
global _imperative_tracer_
tmp_trace = _imperative_tracer_
_imperative_tracer_ = tracer
def _dygraph_guard(tracer):
global _dygraph_tracer_
tmp_trace = _dygraph_tracer_
_dygraph_tracer_ = tracer
yield
_imperative_tracer_ = tmp_trace
_dygraph_tracer_ = tmp_trace
@signature_safe_contextmanager
def _imperative_place_guard(place):
global _imperative_current_expected_place_
tmp_place = _imperative_current_expected_place_
_imperative_current_expected_place_ = place
def _dygraph_place_guard(place):
global _dygraph_current_expected_place_
tmp_place = _dygraph_current_expected_place_
_dygraph_current_expected_place_ = place
yield
_imperative_current_expected_place_ = tmp_place
_dygraph_current_expected_place_ = tmp_place
This diff is collapsed.
......@@ -17,7 +17,7 @@ from .param_attr import ParamAttr
from .initializer import Constant
from . import layers
from . import backward
from .imperative import Layer, nn
from .dygraph import Layer, nn
from . import executor
from . import core
......
......@@ -17,7 +17,7 @@ from __future__ import print_function
import copy
import six
from .framework import Parameter, dtype_is_floating, _in_imperative_mode
from .framework import Parameter, dtype_is_floating, _in_dygraph_mode
from . import unique_name
from paddle.fluid.initializer import Constant, Xavier
from .param_attr import ParamAttr
......@@ -30,9 +30,9 @@ class LayerHelper(LayerHelperBase):
def __init__(self, layer_type, **kwargs):
self.kwargs = kwargs
name = self.kwargs.get('name', None)
# TODO(panyx0718, minqiyang): imperative mode
# TODO(panyx0718, minqiyang): dygraph mode
# can not use both `layer_type` and `name`. Deprecate LayerHelper
# and write a Helper for imperative mode.
# and write a Helper for dygraph mode.
if name is None:
self.kwargs['name'] = unique_name.generate(layer_type)
......
......@@ -929,9 +929,9 @@ def array_read(array, i):
Examples:
.. code-block:: python
tmp = fluid.layers.zeros(shape=[10], dtype='int32')
array = fluid.layers.create_array(dtype='float32')
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
arr = layers.array_read(tmp, i=i)
item = fluid.layers.array_read(array, i)
"""
helper = LayerHelper('array_read', **locals())
if not isinstance(
......
This diff is collapsed.
......@@ -20,7 +20,6 @@ from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable
from ..initializer import Constant, force_init_on_cpu
from ..core import VarDesc
from ..imperative import base as imperative_base
from .layer_function_generator import templatedoc
import numpy
......
This diff is collapsed.
......@@ -156,7 +156,7 @@ class TestGRUOp(OpTest):
}
def test_check_output(self):
self.check_output(atol=1e-8, check_imperative=True)
self.check_output(atol=1e-8, check_dygraph=True)
def test_check_grad(self):
self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
......
This diff is collapsed.
This diff is collapsed.