diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 380edb9164e4f40fb03755bf5d17f70a0ff7cb53..7bc68b431ab11ded5e8197875709acb02fda0d8c 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -213,7 +213,8 @@ def ParseYamlArgs(string): default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None - assert arg_type in yaml_types_mapping.keys(), arg_type + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." arg_type = yaml_types_mapping[arg_type] arg_name = RemoveSpecialSymbolsInName(arg_name) @@ -248,7 +249,8 @@ def ParseYamlReturns(string): else: ret_type = ret.strip() - assert ret_type in yaml_types_mapping.keys(), ret_type + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index b4b19c52a348be960758a50926803c9ed669eef6..aba3e227ab4b3c52f423ea581a502589fa93f416 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -25,7 +25,7 @@ atype_to_parsing_function = { "std::string": "CastPyArg2String", "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", - "string": "CastPyArg2String", + "std::string": "CastPyArg2String", "std::vector<bool>": "CastPyArg2Booleans", "std::vector<int>": "CastPyArg2Ints", "std::vector<int64_t>": "CastPyArg2Longs", diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 18068e22b7f3c31d59636bc7ab6a234e109d5ee6..164a13d1560f4d0008c2bdb5a56d8ad6f875157b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2052,18 +2052,19 @@ PDNode *patterns::Pool::operator()() { return output_var; } -PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { - auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) - ->assert_is_op("elementwise_add"); - - x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); - y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); - auto out_var = pattern->NewNode(elementwise_add_out_repr()) +PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, + const std::string elementwise_type) { + auto elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); + + x_var->AsInput()->assert_is_op_input(elementwise_type, "X"); + y_var->AsInput()->assert_is_op_input(elementwise_type, "Y"); + auto out_var = pattern->NewNode(elementwise_out_repr()) ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output(elementwise_type, "Out"); - elementwise_add_op->LinksFrom({x_var, y_var}); - elementwise_add_op->LinksTo({out_var}); + elementwise_op->LinksFrom({x_var, y_var}); + elementwise_op->LinksTo({out_var}); return out_var; }
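A minimal usage sketch of the generalized pattern (not part of the patch): the op type is now a runtime argument, so a single pass can match any binary elementwise operator. The helper name and name scope below are illustrative assumptions; the pattern API calls mirror the call sites later in this diff.

#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace ir = paddle::framework::ir;

// Hypothetical helper: match every elementwise_mul in a graph and log the
// nodes captured by the patterns::Elementwise struct added above.
void MatchElementwiseMul(ir::Graph* graph) {
  ir::GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();

  ir::patterns::Elementwise elementwise_pattern{pattern, "example_scope"};
  elementwise_pattern(
      pattern->NewNode(elementwise_pattern.elementwise_x_repr()),
      pattern->NewNode(elementwise_pattern.elementwise_y_repr()),
      "elementwise_mul");  // any elementwise op type can be passed here

  auto handler = [&](const ir::GraphPatternDetector::subgraph_t& subgraph,
                     ir::Graph* g) {
    GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
                              elementwise_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
                              elementwise_pattern);
    VLOG(4) << "matched " << elementwise_op->Name() << " producing "
            << elementwise_out->Name();
  };
  gpd(graph, handler);
}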
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 062d2f9dedce65f6e16b70f0b201a4ca63b0531a..17c70ace301d39db6fcf14d01c11baab0dc7d403 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1016,20 +1016,20 @@ struct Pool : public PatternBase { PATTERN_DECL_NODE(pool_output); }; -// ElementwiseAdd used in residual connections. -// y_var is used and convolution output. -// The operator is removed, when residual -// connection fusion is on. -struct ElementwiseAdd : public PatternBase { - ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add") {} - - PDNode* operator()(PDNode* x_var, PDNode* y_var); - - PATTERN_DECL_NODE(elementwise_add_op); - PATTERN_DECL_NODE(elementwise_add_x); - PATTERN_DECL_NODE(elementwise_add_y); - PATTERN_DECL_NODE(elementwise_add_out); +// Elementwise ops +// Forward pass for element-wise operators (add, mul) +// elementwise_out is the result of the operator +struct Elementwise : public PatternBase { + Elementwise(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} + + PDNode* operator()(PDNode* x_var, PDNode* y_var, + const std::string elementwise_type); + + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_x); + PATTERN_DECL_NODE(elementwise_y); + PATTERN_DECL_NODE(elementwise_out); }; // Transpose op diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index 0f3f37320b026a7100bd050c1a01b6683765a44f..fc2758c27345032c1ad0831b4ee0016fa84b3f5c 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -145,10 +145,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - conv_output, - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_x_count = 0; @@ -160,16 +160,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + if (!IsReachable(g, elementwise_identity, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -179,14 +179,14 @@
GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_x_count++; }; @@ -212,10 +212,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - conv_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output, + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_y_count = 0; @@ -227,16 +227,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_x, conv_output)) return; + if (!IsReachable(g, elementwise_x, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -246,14 +246,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_x, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_y_count++; }; @@ -282,8 +282,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( patterns::Conv conv_y_pattern{pattern, name_scope}; auto conv_y_output = conv_y_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern(conv_x_output, conv_y_output); + patterns::Elementwise 
elementwise_pattern{pattern, name_scope}; + elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add"); conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); @@ -301,10 +301,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!IsCompat(subgraph, g)) { LOG(WARNING) @@ -312,8 +312,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( return; } - if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; - if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return; Node* projection_node; Node* residual_conv_op; @@ -333,14 +333,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( if (HasFusedActivation(residual_conv_op)) return; residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op}); IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + IR_NODE_LINK_TO(residual_conv_op, elementwise_out); found_projection_conv_count++; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 371482b5343d638f005aa8e0700680b6ac00d6ec..f4358fb243f20bc9b024ef6b02768773fa995f45 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count); } -void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { +void CPUQuantizePass::QuantizeElementwise( + Graph* graph, const std::string elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + patterns::Elementwise elementwise_pattern{pattern, name_scope_}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), + pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + elementwise_type); - int quantize_elementwise_add_count = 0; + int quantize_elementwise_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize elementwise_add op"; - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); + 
VLOG(4) << "Quantize " + elementwise_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) { - LogQuantizationDisabled(elementwise_add_op); + if (!platform::HasOpINT8DataType(elementwise_op->Op())) { + LogQuantizationDisabled(elementwise_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!AreScalesPresentForNodes( - {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { - LogCannotQuantizeOp(elementwise_add_op, + {elementwise_x, elementwise_y, elementwise_out})) { + LogCannotQuantizeOp(elementwise_op, "No scale available for the operator"); return; } bool is_x_unsigned{false}, is_y_unsigned{false}; - auto input_x_scale = - GetScaleValueForNode(elementwise_add_x, &is_x_unsigned); - auto input_y_scale = - GetScaleValueForNode(elementwise_add_y, &is_y_unsigned); + auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned); + auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned); // TODO(sfraczek): add support for different signness if (is_x_unsigned != is_y_unsigned) { - LogCannotQuantizeOp(elementwise_add_op, - "ElementwiseAdd inputs must be of the same type."); + LogCannotQuantizeOp(elementwise_op, + "Elementwise inputs must be of the same type."); return; } - QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale, + QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); - QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, + QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + GetScaleValueForNode(elementwise_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); + DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale, + is_output_unsigned, "Scale_out"); - ++quantize_elementwise_add_count; + ++quantize_elementwise_count; }; gpd(graph, handler); - AddStatis(quantize_elementwise_add_count); + AddStatis(quantize_elementwise_count); - PrettyLogDetail("--- quantized %d elementwise_add ops", - quantize_elementwise_add_count); + PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count, + elementwise_type); } void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { @@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFc(graph); QuantizeReshape(graph); QuantizeMatmul(graph); - QuantizeElementwiseAdd(graph); + QuantizeElementwise(graph, "elementwise_add"); + QuantizeElementwise(graph, "elementwise_mul"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 
412c4e40a01d50b73f72076f3a0424081d633247..3a286264e41ffe1c329ba3971d777ce4fbc05b5e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeTranspose(Graph* graph) const; void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeElementwise(Graph* graph, + const std::string elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 889417b78c8641060b8ad89219749d8400558c6a..22000865948d629a5933ad0319e41dab71433fff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add") { + } else if (type == "elementwise_add" || type == "elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "matmul" || type == "elementwise_add") { + } else if (type == "matmul" || type == "elementwise_add" || + type == "elementwise_mul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) { expected_operators, added_nodes, 1.0f); } -static const std::initializer_list<std::string> variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; +static const std::initializer_list<std::string> variable_names_elementwise = { + "a", "b", "c", "d", "e", "f"}; -ProgramDesc BuildProgramDescElementwiseAdd() { +ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, + const std::string elementwise_name) { ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { + for (auto& v : variable_names_elementwise) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, + SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; } -TEST(CpuQuantizePass, elementwise_add) { +void TestElementwise(const std::string elementwise_type, + const std::string elementwise_name) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map<std::string, int> expected_operators = { - {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, SCALE * S8_MAX); + {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, + SCALE * S8_MAX); }
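With the parametrized helper above, covering another elementwise operator later reduces to one-line test bodies. A hypothetical example (elementwise_sub INT8 support is not part of this patch):

TEST(CpuQuantizePass, elementwise_sub) {
  // Hypothetical: valid only once QuantizeElementwise also handles
  // "elementwise_sub".
  TestElementwise("elementwise_sub", "ElementwiseSub");
}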
-TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { +void TestElementwiseOutputScaleMissing(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map<std::string, int> expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "e"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "e"); } -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { +void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map<std::string, int> expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "", "b"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "", "b"); +} + +TEST(CpuQuantizePass, elementwise_add) { + TestElementwise("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_mul) { + TestElementwise("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); } const std::vector<std::string> churn_out_vars(ProgramDesc* prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 5f74b61ee86aad10880f3a67d8250026a6e9ac18..3b883dac9782af8350b3e22d2954e21789a1a120 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set<std::string> supported_op_types = std::unordered_set<std::string>( - {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", - "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", - "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice"}); + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", + "elementwise_mul", "fc", "matmul", "nearest_interp", + "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", + "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); const auto& excluded_ids_list = Get<std::unordered_set<int>>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 22613cbe2a2b2cb2eb491142a58172a8a5235c59..865943696c35aee75f5b8e48326b3d61c1e58532 100644 --- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { @@ -1148,63 +1150,221 @@ REGISTER_OP_CUDA_KERNEL( FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP -#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type, KP, plat::XPUPlace, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ - ops::ActivationGradCudaKernel>); - -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, - CudaSoftplusGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, - CudaHardSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, - CudaCELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, - CudaLog1pGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, - CudaBReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, - CudaSoftReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, - CudaSoftsignGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, - CudaRelu6GradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, - CudaHardShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, - CudaHardSigmoidFunctor, - CudaHardSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, - CudaSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, - CudaThresholdedReluFunctor, - CudaThresholdedReluGradFunctor); +REGISTER_OP_KERNEL( + brelu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + brelu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + ceil_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); 
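// Note: the template argument lists of these REGISTER_OP_KERNEL calls are
// lost in this rendering. Each pair plausibly follows the shape below,
// shown for brelu and reusing the Cuda*Functor names from the macro being
// removed; the exact types in the patch may differ.
REGISTER_OP_KERNEL(
    brelu, KP, plat::XPUPlace,
    ops::ActivationCudaKernel<plat::XPUDeviceContext,
                              ops::CudaBReluFunctor<float>>);
REGISTER_OP_KERNEL(
    brelu_grad, KP, plat::XPUPlace,
    ops::ActivationGradCudaKernel<plat::XPUDeviceContext,
                                  ops::CudaBReluGradFunctor<float>>);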
+REGISTER_OP_KERNEL( + celu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + elu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + exp_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + floor_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_shrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_shrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + leaky_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + leaky_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log1p_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + logsigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + logsigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + reciprocal, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + reciprocal_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu6_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + silu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + soft_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softplus_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + softshrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softshrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softsign_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sqrt_grad, KP, 
plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(square, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + square_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + thresholded_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + thresholded_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); #endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 68083c759859b420bd6e60496614234a96519935..6959b5cf811069cc66321d2129a2b69d4e922f09 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/determinant_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,11 +24,6 @@ namespace operators { class DeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } }; class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", - "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", - framework::GradVarName("Input"), "DeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -162,11 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker); + ops::DeterminantGradOpMaker, + DeterminantInferShapeFunctor); -REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) +DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp, + DeterminantGradInferShapeFunctor); REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc 
b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 838df2e1625912dad127b672228f9cc64eb7cec3..f9347d281043ecc63acdb8ca2fb0a18dae4adc47 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,100 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - reduction_p->execute(astream, {{DNNL_ARG_SRC, 
*reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -116,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseAddMKLDNNGradKernel, - ops::EltwiseAddMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index 367d602f5902e816a468d43ccfa009fe35a045fc..c68aa8d3d1b46c9013c6fe6a12510f0cdb744682 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -1,146 +1,28 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout / y - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = -dout * out / y - - platform::BinaryMKLDNNHandler y_handler( - 
dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, - y, nullptr, 1.0f, 1.0f, 1.0f); - - const auto y_memory = y_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_out_memory = handler.AcquireSecondSrcMemory(out); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_out_memory}, - {DNNL_ARG_DST, *dst_dy_memory}, - {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -// TODO(piotrekobi) add int8, uint8 support -REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseDivMKLDNNGradKernel, - ops::EltwiseDivMKLDNNGradKernel) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
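For reference, the gradients that the removed EltwiseDivMKLDNNGradKernel computed, and that the unified EltwiseMKLDNNGradKernel in elementwise_mkldnn_op.h (below) must reproduce, are dx = dout / y and dy = -dout * out / y. A standalone numeric check of both formulas, with all names local to this sketch:

#include <cassert>
#include <cmath>

int main() {
  const float x = 3.0f, y = 2.0f, dout = 1.5f;
  const float out = x / y;

  const float dx = dout / y;         // dz/dx = 1/y
  const float dy = -dout * out / y;  // dz/dy = -x/y^2 = -z/y

  // Cross-check against central finite differences of z = x / y.
  const float eps = 1e-3f;
  const float dz_dx = ((x + eps) / y - (x - eps) / y) / (2.0f * eps);
  const float dz_dy = (x / (y + eps) - x / (y - eps)) / (2.0f * eps);
  assert(std::fabs(dx - dout * dz_dx) < 1e-2f);
  assert(std::fabs(dy - dout * dz_dy) < 1e-2f);
  return 0;
}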
+ +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL( + elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ad8fd317013908e8908dff8bea3440e24779454e..761b401ca9a2e535e1badfee834ef9ee98a07aae 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,20 +15,35 @@ #pragma once #include #include -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; + +inline std::vector CalculateBroadcastedDims(const Tensor* x, + const Tensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + size_t j = 0; + std::vector dst_tz_ex(src_tz.size(), 1); + for (size_t i = 0; i < src_tz.size(); ++i) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + + return dst_tz_ex; +} template class EltwiseMKLDNNKernel : public framework::OpKernel { @@ -103,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory = nullptr; + std::shared_ptr dst_memory; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather @@ -135,19 +150,193 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { } }; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); +template +class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - size_t j = 0; - std::vector dst_tz_ex(src_tz.size(), 1); - for (size_t i = 0; i < src_tz.size(); ++i) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 
1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); - return dst_tz_ex; -} + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + int axis = ctx.Attr("axis"); + + auto tz = phi::vectorize(dout->dims()); + auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + + platform::ReorderMKLDNNHandler reorder_handler( + tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout), + onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } + + // elementwise_mul & elementwise_div + else { + platform::BinaryMKLDNNHandler binary_handler( + BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, + 1.0f, 1.0f); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); + const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + + if (dy) { + dnnl::primitive_attr broadcast_reduction_attr; + std::shared_ptr broadcast_src_memory; + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales(1); + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 
1 : -1; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + + dst_memory = reorder_dst_memory_p; + } else { + broadcast_src_memory = reorder_src_memory_p; + } + } + + // elementwise_mul & elementwise_div + else { + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + platform::BinaryMKLDNNHandler binary_handler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + platform::BinaryMKLDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(), + y, y, nullptr, 1.0f, 1.0f, 1.0f); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = platform::BinaryMKLDNNHandler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(dout); + + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + dy->set_layout(DataLayout::kMKLDNN); + + if (dout->dims() != dy->dims()) { + // Broadcasting + if (BINARY_OP == dnnl::algorithm::binary_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + broadcast_reduction_attr.set_post_ops(po); + } + + platform::ReductionMKLDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(dy); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *broadcast_src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims())))); + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index c03794012ff3b793684222c62f423edd6e8637f1..0ef5c5e628ce62084305fc95e66862a15822ecb3 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ 
-1,127 +1,19 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout*y - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = dout*x - // Handler is having nullptr passed instead of output tensor as - // we want Dst buffer to be allocated by oneDNN not to use Tensor - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_x_memory = handler.AcquireSecondSrcMemory(x); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_x_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -132,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseMulMKLDNNGradKernel, - ops::EltwiseMulMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 3c799008a2abcf3fc59da7b759c9d43f3e940e8e..510373831eb6db5c7ffed6e8e58cbfb0ae268a50 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -1,5 +1,4 @@ - -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,113 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
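Note on the two hunks around this point: the per-op grad kernels (EltwiseMulMKLDNNGradKernel above, EltwiseSubMKLDNNGradKernel below) are deleted in favor of the shared EltwiseMKLDNNGradKernel, parameterized by the dnnl::algorithm (the BINARY_OP seen in elementwise_mkldnn_op.h earlier in this diff). A minimal standalone sketch of that consolidation, with hypothetical names and plain loops standing in for the oneDNN primitives:

    #include <cstddef>
    #include <vector>

    enum class BinaryAlg { kMul, kSub };

    // One grad routine parameterized by the algorithm, so mul
    // (dX = dOut * Y, dY = dOut * X) and sub (dX = dOut, dY = -dOut)
    // share a single code path, as the templated kernel does with oneDNN.
    template <BinaryAlg alg, typename T>
    void EltwiseGradRef(const std::vector<T>& x, const std::vector<T>& y,
                        const std::vector<T>& dout, std::vector<T>* dx,
                        std::vector<T>* dy) {
      dx->assign(dout.size(), T{});
      dy->assign(dout.size(), T{});
      for (std::size_t i = 0; i < dout.size(); ++i) {
        if (alg == BinaryAlg::kMul) {
          (*dx)[i] = dout[i] * y[i];
          (*dy)[i] = dout[i] * x[i];
        } else {  // kSub
          (*dx)[i] = dout[i];
          (*dy)[i] = -dout[i];
        }
      }
    }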
-#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - - dnnl::primitive_attr reorder_attr; - std::vector scales = {-1}; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); - dnnl::primitive_attr attr; - attr.set_post_ops(po); - - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); - - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - reduction_p->execute(astream, { - {DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}, - }); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; @@ -131,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) 
-REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseSubMKLDNNGradKernel, - ops::EltwiseSubMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index c7fb92cd5107cee12e0995948e320ef3ed616f4d..9c16ccb138f7da56568ce6224dc30deb5bbccb7f 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of ModeOp must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of ModeOp must have >= 1d shape")); - if (axis < 0) axis += dim_size; - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input shape should >= 1d")); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor, + PD_INFER_META(phi::ModeInferMeta)); REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, ops::ModeGradOpMaker, - ops::ModeGradOpMaker); -REGISTER_OP_CPU_KERNEL(mode, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel); - + ops::ModeGradOpMaker, + ModeInferShapeFunctor); REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); -REGISTER_OP_CPU_KERNEL( - mode_grad, ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu 
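The hand-written InferShape removed below is superseded by phi::ModeInferMeta, wired up at the end of this file through DECLARE_INFER_SHAPE_FUNCTOR. Both implement the same shape rule, sketched here as a self-contained helper: the axis dimension is dropped from the input shape, or kept as 1 when keepdim is set.

    #include <cstdint>
    #include <vector>

    // e.g. in = {2, 3, 4}, axis = 1: keepdim=false -> {2, 4}, true -> {2, 1, 4}
    std::vector<int64_t> ModeOutDims(const std::vector<int64_t>& in, int axis,
                                     bool keepdim) {
      const int rank = static_cast<int>(in.size());
      if (axis < 0) axis += rank;  // normalize a negative axis
      std::vector<int64_t> out;
      for (int i = 0; i < rank; ++i) {
        if (i == axis) {
          if (keepdim) out.push_back(1);
        } else {
          out.push_back(in[i]);
        }
      }
      return out;
    }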
b/paddle/fluid/operators/mode_op.cu deleted file mode 100644 index 2bacda8afb0eb340c4c8d4068f3013e2adbc7f91..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mode_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" - -namespace paddle { -namespace operators { - -int ComputeBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -void getModebySort(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, - const int64_t num_cols, const int64_t num_rows, - T* out_tensor, int64_t* indices_tensor) { - framework::Tensor input_tmp; - framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); - T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); - thrust::device_ptr out_tensor_ptr(out_tensor); - thrust::device_ptr indices_tensor_ptr(indices_tensor); - - for (int64_t i = 0; i < num_rows; ++i) { - T* begin = input_tmp_data + num_cols * i; - T* end = input_tmp_data + num_cols * (i + 1); - thrust::device_vector indices_data(num_cols); - thrust::sequence(thrust::device, indices_data.begin(), - indices_data.begin() + num_cols); - thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); - int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, - begin + 1, 0, thrust::plus(), - thrust::not_equal_to()); - thrust::device_vector keys_data(unique); - thrust::device_vector cnts_data(unique); - thrust::reduce_by_key(thrust::device, begin, end, - thrust::constant_iterator(1), keys_data.begin(), - cnts_data.begin()); - auto it = thrust::max_element(thrust::device, cnts_data.begin(), - cnts_data.begin() + unique); - T mode = keys_data[it - cnts_data.begin()]; - int64_t counts = cnts_data[it - cnts_data.begin()]; - auto pos = thrust::find(thrust::device, begin, end, mode); - int64_t index = indices_data[pos - begin + counts - 1]; - out_tensor_ptr[i] = static_cast(mode); - indices_tensor_ptr[i] = static_cast(index); - } -} - -template -class ModeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = 
static_cast(ctx.Attr("keepdim")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - getModebySort(dev_ctx, input, input_width, input_height, output_data, - indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - for (int i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - // second step, tranpose the input - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, ctx.GetPlace()); - int ndims = trans_axis.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans_axis); - framework::Tensor trans_ind; - int64_t* trans_ind_data = - trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); - framework::Tensor trans_out; - T* trans_out_data = - trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - getModebySort(dev_ctx, &trans_input, input_width, input_height, - trans_out_data, trans_ind_data); - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans_axis); - TransCompute(ndims, dev_ctx, trans_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - - if (axis < 0) axis += in_dims.size(); - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* 
indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - int block_size = ComputeBlockSize(post); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mode, ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - mode_grad, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h deleted file mode 100644 index 76d356ed16eb3f81b10d541230f49b73fd836543..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.h +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -static void getMode(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - T mode = 0; - int64_t indice = 0; - int64_t cur_freq = 0; - int64_t max_freq = 0; - for (int64_t i = 0; i < input_width; ++i) { - ++cur_freq; - if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { - if (cur_freq > max_freq) { - max_freq = cur_freq; - mode = col_vec[i].first; - indice = col_vec[i].second; - } - cur_freq = 0; - } - } - t_out[i] = mode; - t_indices[i] = indice; - } -} - -template -static void ModeAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i 
< input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class ModeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - bool keepdim = static_cast(context.Attr("keepdim")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - // if axis is not the last dim, transpose it to the last dim, do the - // calculation, - // then tranpose it back to orginal axis. - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getMode(input_height, input_width, in_dims.size(), input, - output_data, indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - // get the trans input_dims, out_dims - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_input, trans_axis); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); - framework::Tensor tmp_indices; - auto* t_ind = tmp_indices.mutable_data(trans_out_shape, - context.GetPlace()); - - getMode(input_height, input_width, in_dims.size(), - &trans_input, t_out, t_ind); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans_axis); - TransCompute(ndims, dev_context, tmp_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template 
-class ModeGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - // allocate the memory for the input_grad - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - if (keepdim) { - ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, - x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - framework::DDim trans_shape(out_dims); - framework::DDim trans_in_shape(in_dims); - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = out_dims[trans_axis[i]]; - trans_in_shape[i] = in_dims[trans_axis[i]]; - } - // transpose the out_grad, indices - framework::Tensor trans_dO; - trans_dO.mutable_data(trans_shape, context.GetPlace()); - framework::Tensor trans_ind; - trans_ind.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - if (keepdim) { - // Do transpose - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans_axis); - } else { - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - 
framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - // Do transpose - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans_axis); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); - const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; - - // Assign the out_grad to tranpose input_grad - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - ModeAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans_axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 867d854ba3c9d0954dfe2d038405daf1726a2556..3a1b45d3a20a1e3ff6698f37e412837fcb064f7c 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -90,6 +90,9 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor_map") { + return DenseTensorMapType::get(parser.getContext()); + } if (keyword == "dense_tensor") { // parse DenseTensor, for example: !i=Infrt.tensor llvm::StringRef target; @@ -158,6 +161,10 @@ void InfrtDialect::printType(::mlir::Type type, << lod_tensor_type.getLod_level() << ">"; return; } + if (type.isa()) { + os << "dense_tensor_map"; + return; + } // print DenseTensorType, for example: !infrt.dense_tensor if (type.isa()) { diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 18c25827b8ec5a71907e694cea4e7680b598e883..96aecb755898466ee9530cb3bdc894e51d074cd7 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -13,15 +13,17 @@ // limitations under the License. 
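The parseType/printType additions above make the new !infrt.dense_tensor_map type round-trippable: the keyword the parser accepts is exactly what the printer emits, so textual IR survives a parse/print cycle. A toy illustration of that contract in plain C++ (not the MLIR API):

    #include <cassert>
    #include <string>

    enum class TypeTag { kDenseTensorMap, kUnknown };

    // Parse maps the keyword to an internal tag ...
    TypeTag ParseKeyword(const std::string& kw) {
      return kw == "dense_tensor_map" ? TypeTag::kDenseTensorMap
                                      : TypeTag::kUnknown;
    }

    // ... and print must map the tag back to the same keyword.
    std::string PrintTag(TypeTag tag) {
      assert(tag == TypeTag::kDenseTensorMap);
      return "dense_tensor_map";
    }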
#include "paddle/infrt/host_context/paddle_mlir.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops_info.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { - context_->allowUnregisteredDialects(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); + context_->getOrLoadDialect<::infrt::InfrtDialect>(); module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_)); } @@ -55,7 +57,6 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( UpdateModelParams(program, &mainFunc); UpdateModelOps(program); UpdateModelOutputs(program); - return module_; } @@ -171,7 +172,11 @@ void MLIRModelGenImpl::UpdateModelParams( ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), builder_, &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( @@ -197,8 +202,9 @@ void MLIRModelGenImpl::UpdateModelOutputs( llvm::SmallVector resultTypes; llvm::SmallVector attrs; + mlir::OperationState state(loc, - mlir::ReturnOp::getOperationName(), + ::infrt::ReturnOp::getOperationName(), operands, resultTypes, attrs); @@ -321,7 +327,7 @@ llvm::SmallVector MLIRModelGenImpl::GetOpAttributes( switch (type) { ATTR_IMPL_CASE(FLOAT, f, getF32FloatAttr); ATTR_IMPL_CASE(BOOLEAN, b, getBoolAttr); - ATTR_IMPL_CASE(INT, i, getI32IntegerAttr); + ATTR_IMPL_CASE(INT, i, getSI32IntegerAttr); ATTR_IMPL_CASE(LONG, l, getI64IntegerAttr); ATTR_IMPL_CASE(STRING, s, getStringAttr); diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index e5cc1ec1121fb7bbff2fad7856151916d8ea0924..58543a6864258bd6c0153150bb535262d9a8f00d 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,3 +1,5 @@ +cc_test_tiny(test_abs_model SRCS model/test_abs.cc DEPS infrt ${MLIR_IR_LIBS}) + configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" diff --git a/paddle/infrt/tests/model/abs_model.py b/paddle/infrt/tests/model/abs_model.py new file mode 100644 index 0000000000000000000000000000000000000000..dd1632bc9d4d8e4e6ea0fb918d1179f4e28a441b --- /dev/null +++ b/paddle/infrt/tests/model/abs_model.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle.nn import Layer +from paddle.static import InputSpec +from paddle.jit import to_static +import sys + + +class AbsNet(paddle.nn.Layer): + def __init__(self): + super(AbsNet, self).__init__() + + def forward(self, x): + x = paddle.abs(x) + return x + + +if __name__ == '__main__': + # build network + model = AbsNet() + # save inferencing format model + net = to_static( + model, input_spec=[InputSpec( + shape=[None, 1, 28, 28], name='x')]) + paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc new file mode 100644 index 0000000000000000000000000000000000000000..5de159b86fce29f774b07770aaaee0c1b6aebd31 --- /dev/null +++ b/paddle/infrt/tests/model/test_abs.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/registry.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" +#include "paddle/infrt/host_context/paddle_mlir.h" + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +TEST(ABS_MODEL, convert_and_execute) { + std::string model_file_name = "./abs.pdmodel"; + std::string params_file_name = "./abs.pdiparams"; + // convert model + MLIRModelGenImpl myGen; + auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name); + module_.dump(); + // pick kernel + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + context->getOrLoadDialect(); + + 
context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->loadAllAvailableDialects(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest<mlir::FuncOp>(); + std::vector<infrt::Place> valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); + phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); + + if (mlir::failed(pm.run(module_))) { + std::cout << "\npass failed!\n" << std::endl; + } + module_.dump(); + + // execute + infrt::host_context::KernelRegistry registry; + infrt::kernel::RegisterBasicKernels(&registry); + infrt::kernel::RegisterTestKernels(&registry); + infrt::kernel::RegisterTensorShapeKernels(&registry); + infrt::kernel::RegisterTensorKernels(&registry); + infrt::kernel::RegisterControlFlowKernels(&registry); + infrt::kernel::RegisterPhiKernels(&registry); + infrt::kernel::RegisterInferShapeLaunchers(&registry); + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + break; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast<void (*)(infrt::host_context::KernelRegistry*)>( + reg_sym); + reg_func(&registry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + infrt::host_context::TestMlir(module_, &registry); +} diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 262ada3eaf3169bebc919940e7630a75b0733cd9..f81f4a1b7c7390baca82d55abecbc49d0f67c235 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -648,6 +648,49 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, mask->set_dtype(paddle::experimental::CppTypeToDataType::Type()); } +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), + true, + errors::InvalidArgument( + "the axis of ModeOp must be [-%d, %d), but you set axis is %d", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + errors::InvalidArgument("input of ModeOp must have >= 1d shape")); + if (axis < 0) axis += dim_size; + std::vector<int64_t> dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast<int64_t>(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + PADDLE_ENFORCE_GE(input_dims.size(), + 1, + errors::InvalidArgument("input shape should >= 1d")); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 3dfc9b797c089281cd9631642640a54be05ce679..eb894003e5354222bff14945cc0e2aeb565b4e3d 100644 ---
a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -112,6 +112,12 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, MetaTensor* mask, MetaConfig config = MetaConfig()); +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, diff --git a/paddle/phi/kernels/cpu/mode_grad_kernel.cc b/paddle/phi/kernels/cpu/mode_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca813c1757eacce24ecea8687b7b80bd43c5e8f9 --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; + + if (!keepdim) { + std::vector<int64_t> tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc<T>(x_grad); + + if (axis == in_dims.size() - 1) { + // allocate the memory for the input_grad + // assign the out_grad to input_grad directly + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements have no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + if (keepdim) { + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc<T>(&out_grad_tmp); + DenseTensor indices_tmp; + dev_ctx.template Alloc<int64_t>(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + // can not assign grad to input_grad, must do the transpose + std::vector<int> trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + DDim trans_shape(out_dims); + DDim trans_in_shape(in_dims); + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = out_dims[trans_axis[i]]; + trans_in_shape[i] = in_dims[trans_axis[i]]; + } + // transpose the out_grad, indices + DenseTensor trans_dO; + trans_dO.Resize(trans_shape); + dev_ctx.template Alloc<T>(&trans_dO); + + DenseTensor trans_ind; + trans_ind.Resize(trans_shape); + dev_ctx.template Alloc<int64_t>(&trans_ind); + + int ndims = trans_axis.size(); + + if (keepdim) { + // Do transpose + funcs::TransCompute<phi::CPUContext, T>( + ndims, dev_ctx, out_grad, &trans_dO, trans_axis); + funcs::TransCompute<phi::CPUContext, int64_t>( + ndims, dev_ctx, indices, &trans_ind, trans_axis); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc<T>(&out_grad_tmp); + + DenseTensor indices_tmp; + dev_ctx.template Alloc<int64_t>(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + // Do transpose + funcs::TransCompute<phi::CPUContext, T>( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis); + funcs::TransCompute<phi::CPUContext, int64_t>( + ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); + const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; + + // Assign the out_grad to the transposed input_grad + DenseTensor tmp_out; + tmp_out.Resize(trans_in_shape); + T* t_out = dev_ctx.template Alloc<T>(&tmp_out); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + + // Transpose back + funcs::TransCompute<phi::CPUContext, T>( + ndims,
dev_ctx, tmp_out, x_grad, trans_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + CPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/mode_kernel.cc b/paddle/phi/kernels/cpu/mode_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6535d1b89af420ee4266981f004983157179f34f --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template <typename T, typename Context> +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + auto out_dims = out->dims(); + // axis < 0, calculate the real axis + if (axis < 0) axis += in_dims.size(); + + T* output_data = dev_ctx.template Alloc<T>(out); + int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices); + // if axis is not the last dim, transpose it to the last dim, do the + // calculation, then transpose it back to original axis.
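+ // Worked example of the semantics: for a row {2, 2, 1, 3}, funcs::GetMode
+ // sorts it ascending, counts runs of equal values, and returns value 2
+ // together with the original index of one of its occurrences. On a
+ // frequency tie the smaller value wins, because runs are scanned in
+ // ascending order and only a strictly greater count replaces the mode.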
+ if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetMode(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + // get the trans input_dims, out_dims + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + int ndims = trans_axis.size(); + + // transpose the input value + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_out_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + + DenseTensor tmp_indices; + tmp_indices.Resize(trans_out_shape); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + + funcs::GetMode( + input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind); + // transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc index 585c27bdcec97e11a68cdc536c829f76c000a8df..a5c9dc4c55e495833f40ec7499e6c0373594d319 100644 --- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc index d0413457f8177338aa450211539dc16d0880c74c..ad76a7a86bcb28f291288418c43740ed0b7adb97 100644 --- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc @@ -18,5 +18,11 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + CPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h new file mode 
100644 index 0000000000000000000000000000000000000000..1b7641762e2639acf3db540280891b518f22eed2 --- /dev/null +++ b/paddle/phi/kernels/funcs/mode.h @@ -0,0 +1,197 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLML +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +static int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +static inline void GetDims( + const phi::DDim& dim, int axis, int* pre, int* n, int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template +static void GetMode(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + std::sort(col_vec.begin(), + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + T mode = 0; + int64_t indice = 0; + int64_t cur_freq = 0; + int64_t max_freq = 0; + for (int64_t i = 0; i < input_width; ++i) { + ++cur_freq; + if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { + if (cur_freq > max_freq) { + max_freq = cur_freq; + mode = col_vec[i].first; + indice = col_vec[i].second; + } + cur_freq = 0; + } + } + t_out[i] = mode; + t_indices[i] = indice; + } +} + +template +static void ModeAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + 
auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +static void GetModebySort(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + T* out_tensor, + int64_t* indices_tensor) { + DenseTensor input_tmp; + input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); + T* input_tmp_data = dev_ctx.Alloc(&input_tmp); + phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp); + + thrust::device_ptr out_tensor_ptr(out_tensor); + thrust::device_ptr indices_tensor_ptr(indices_tensor); + + for (int64_t i = 0; i < num_rows; ++i) { + T* begin = input_tmp_data + num_cols * i; + T* end = input_tmp_data + num_cols * (i + 1); + thrust::device_vector indices_data(num_cols); + thrust::sequence( + thrust::device, indices_data.begin(), indices_data.begin() + num_cols); + thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); + int unique = 1 + thrust::inner_product(thrust::device, + begin, + end - 1, + begin + 1, + 0, + thrust::plus(), + thrust::not_equal_to()); + thrust::device_vector keys_data(unique); + thrust::device_vector cnts_data(unique); + thrust::reduce_by_key(thrust::device, + begin, + end, + thrust::constant_iterator(1), + keys_data.begin(), + cnts_data.begin()); + auto it = thrust::max_element( + thrust::device, cnts_data.begin(), cnts_data.begin() + unique); + T mode = keys_data[it - cnts_data.begin()]; + int64_t counts = cnts_data[it - cnts_data.begin()]; + auto pos = thrust::find(thrust::device, begin, end, mode); + int64_t index = indices_data[pos - begin + counts - 1]; + out_tensor_ptr[i] = static_cast(mode); + indices_tensor_ptr[i] = static_cast(index); + } +} +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 4cf5e1c02c59757ee8bd0ae91c18d0882b702da1..417c1cd234754f994383988c63ff44ba06794822 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -392,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // paddle::platform::ChangeThreadNum(context, &thread_num); + // backends::gpu::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -460,7 +460,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -527,7 +527,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1293,7 +1293,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef 
diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu
index 4cf5e1c02c59757ee8bd0ae91c18d0882b702da1..417c1cd234754f994383988c63ff44ba06794822 100644
--- a/paddle/phi/kernels/funcs/pooling.cu
+++ b/paddle/phi/kernels/funcs/pooling.cu
@@ -392,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()(
   int nthreads = batch_size * output_channels * output_height * output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  // paddle::platform::ChangeThreadNum(context, &thread_num);
+  // backends::gpu::ChangeThreadNum(context, &thread_num);
   thread_num = 512;
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
@@ -460,7 +460,7 @@ class Pool2dFunctor {
   int nthreads = batch_size * output_channels * output_height * output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  paddle::platform::ChangeThreadNum(context, &thread_num);
+  backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
   dim3 threads(thread_num, 1);
@@ -527,7 +527,7 @@ class Pool2dFunctor {
   int nthreads = batch_size * output_channels * output_height * output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  paddle::platform::ChangeThreadNum(context, &thread_num);
+  backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
   dim3 threads(thread_num, 1);
@@ -1293,7 +1293,7 @@ class Pool3dFunctor {
       output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  paddle::platform::ChangeThreadNum(context, &thread_num);
+  backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
   dim3 threads(thread_num, 1);
@@ -1369,7 +1369,7 @@ class Pool3dFunctor {
       output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  paddle::platform::ChangeThreadNum(context, &thread_num);
+  backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
   dim3 threads(thread_num, 1);
@@ -1906,7 +1906,7 @@ class MaxPool2dWithIndexFunctor {
   int nthreads = batch_size * output_channels * output_height * output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  paddle::platform::ChangeThreadNum(context, &thread_num);
+  backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
@@ -2205,7 +2205,7 @@ class MaxPool3dWithIndexFunctor {
       output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  paddle::platform::ChangeThreadNum(context, &thread_num);
+  backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
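Editor's note: these hunks only move ChangeThreadNum into the backends::gpu namespace; the launch sizing itself is the usual ceil-division pattern, with a smaller block size on Jetson builds. A minimal Python sketch of that arithmetic (function name hypothetical):

def launch_config(nthreads, thread_num=1024, nv_jetson=False):
    # WITH_NV_JETSON builds shrink the block size before computing the grid
    if nv_jetson:
        thread_num = 512
    blocks = (nthreads + thread_num - 1) // thread_num  # ceil division
    return blocks, thread_num

print(launch_config(100000))                  # (98, 1024)
print(launch_config(100000, nv_jetson=True))  # (196, 512)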
diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc
index bf4a21f37223dab5a67649406496e9828b0bcf3f..fbd744430aa11ab1a5a17c76b6d37c10c3085556 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.cc
+++ b/paddle/phi/kernels/funcs/segment_pooling.cc
@@ -149,10 +149,19 @@ template class SegmentPoolFunctor<CPU, float, int>;
 template class SegmentPoolFunctor<CPU, float, int64_t>;
 template class SegmentPoolFunctor<CPU, double, int>;
 template class SegmentPoolFunctor<CPU, double, int64_t>;
+template class SegmentPoolFunctor<CPU, int, int>;
+template class SegmentPoolFunctor<CPU, int, int64_t>;
+template class SegmentPoolFunctor<CPU, int64_t, int>;
+template class SegmentPoolFunctor<CPU, int64_t, int64_t>;
+
 template class SegmentPoolGradFunctor<CPU, float, int>;
 template class SegmentPoolGradFunctor<CPU, float, int64_t>;
 template class SegmentPoolGradFunctor<CPU, double, int>;
 template class SegmentPoolGradFunctor<CPU, double, int64_t>;
+template class SegmentPoolGradFunctor<CPU, int, int>;
+template class SegmentPoolGradFunctor<CPU, int, int64_t>;
+template class SegmentPoolGradFunctor<CPU, int64_t, int>;
+template class SegmentPoolGradFunctor<CPU, int64_t, int64_t>;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu
index 305cd39f077bc359543b399a8775b5a92a2eb00d..95606b152672916116813c97cbbc0856d33e49a7 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.cu
+++ b/paddle/phi/kernels/funcs/segment_pooling.cu
@@ -453,10 +453,19 @@ template class SegmentPoolFunctor<GPU, float, int>;
 template class SegmentPoolFunctor<GPU, float, int64_t>;
 template class SegmentPoolFunctor<GPU, double, int>;
 template class SegmentPoolFunctor<GPU, double, int64_t>;
+template class SegmentPoolFunctor<GPU, int, int>;
+template class SegmentPoolFunctor<GPU, int, int64_t>;
+template class SegmentPoolFunctor<GPU, int64_t, int>;
+template class SegmentPoolFunctor<GPU, int64_t, int64_t>;
+
 template class SegmentPoolGradFunctor<GPU, float, int>;
 template class SegmentPoolGradFunctor<GPU, float, int64_t>;
 template class SegmentPoolGradFunctor<GPU, double, int>;
 template class SegmentPoolGradFunctor<GPU, double, int64_t>;
+template class SegmentPoolGradFunctor<GPU, int, int>;
+template class SegmentPoolGradFunctor<GPU, int, int64_t>;
+template class SegmentPoolGradFunctor<GPU, int64_t, int>;
+template class SegmentPoolGradFunctor<GPU, int64_t, int64_t>;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..43502621c2d3a878a144de1878aa09b8d64b6a47
--- /dev/null
+++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu
@@ -0,0 +1,85 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T>
+__global__ void AssignGradWithAxis(const T* grad_out,
+                                   const int64_t* indices,
+                                   T* grad_in,
+                                   int pre,
+                                   int post,
+                                   int raw_height,
+                                   int k) {
+  // raw_height is the length of topk axis
+  for (int i = blockIdx.x; i < pre; i += gridDim.x) {
+    int base_index = i * post * k;
+    int base_grad = i * post * raw_height;
+    for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
+      grad_in[base_grad + j] = static_cast<T>(0);
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
+      int64_t idx_ij = indices[base_index + j];
+      int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
+      grad_in[in_ij] = grad_out[base_index + j];
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad) {
+  const auto& in_dims = x.dims();
+  auto out_dims = indices.dims();
+
+  if (axis < 0) axis += in_dims.size();
+  // allocate the cuda memory for the x_grad
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+  const T* out_grad_data = out_grad.data<T>();
+  const int64_t* indices_data = indices.data<int64_t>();
+
+  int pre, n, post;
+  funcs::GetDims(in_dims, axis, &pre, &n, &post);
+
+  // calculate the block and grid num
+  int block_size = funcs::ComputeBlockSize(post);
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+  int grid_size = std::min(max_blocks, pre);
+  AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+      out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(mode_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ModeGradKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t) {}
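Editor's note: AssignGradWithAxis zero-fills grad_in and then scatters each grad_out value back to the slot its recorded index points at; for mode, k is fixed to 1. The same scatter in NumPy (a hedged sketch, helper name hypothetical):

import numpy as np

def mode_grad_reference(x_shape, indices, out_grad, axis):
    x_grad = np.zeros(x_shape, dtype=out_grad.dtype)         # the zero-fill loop
    np.put_along_axis(x_grad, indices, out_grad, axis=axis)  # the scatter loop
    return x_grad

grad = mode_grad_reference((2, 3), np.array([[2], [0]]),
                           np.array([[1.0], [5.0]]), axis=1)
print(grad)  # [[0. 0. 1.] [5. 0. 0.]]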
diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..629b9722cd6bcfe12d0fb5a7e8be6439f5ea286f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/mode_kernel.cu
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  // get the input dims
+  const auto& in_dims = x.dims();
+  // calculate the real axis
+  if (axis < 0) axis += in_dims.size();
+
+  auto out_dims = out->dims();
+
+  const T* input_data = x.data<T>();
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+
+  if (axis == in_dims.size() - 1) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    funcs::GetModebySort<T>(
+        dev_ctx, &x, input_width, input_height, output_data, indices_data);
+  } else {
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+
+    if (!keepdim) {
+      std::vector<int64_t> tmp_out_shape;
+      for (int i = 0; i < axis; i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      tmp_out_shape.emplace_back(1);
+      for (int i = axis + 1; i < in_dims.size(); i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
+      out->Resize(tmp_out_dim);
+      indices->Resize(tmp_out_dim);
+    }
+
+    DDim trans_shape(in_dims);
+    DDim trans_out_shape(in_dims);
+    for (int i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = in_dims[trans_axis[i]];
+      trans_out_shape[i] = in_dims[trans_axis[i]];
+    }
+    trans_out_shape[in_dims.size() - 1] = 1;
+
+    // second step, transpose the input
+    DenseTensor trans_input;
+    trans_input.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_input);
+
+    int ndims = trans_axis.size();
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, x, &trans_input, trans_axis);
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_out_shape);
+    int64_t* trans_ind_data = dev_ctx.template Alloc<int64_t>(&trans_ind);
+
+    DenseTensor trans_out;
+    trans_out.Resize(trans_out_shape);
+    T* trans_out_data = dev_ctx.template Alloc<T>(&trans_out);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
+    const int64_t input_width = trans_shape[trans_shape.size() - 1];
+    funcs::GetModebySort<T>(dev_ctx,
+                            &trans_input,
+                            input_width,
+                            input_height,
+                            trans_out_data,
+                            trans_ind_data);
+    // last step, transpose back the indices and output
+    funcs::TransCompute<phi::GPUContext, int64_t>(
+        ndims, dev_ctx, trans_ind, indices, trans_axis);
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, trans_out, out, trans_axis);
+    if (!keepdim) {
+      out->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {}
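Editor's note: for a non-trailing axis, ModeKernel permutes the target axis to the last position, reuses the row-wise GetModebySort, and permutes the result back. The shape gymnastics are easier to see in NumPy (a hedged sketch; np.bincount restricts it to small non-negative integers):

import numpy as np

def mode_along_axis(x, axis):
    xt = np.moveaxis(x, axis, -1)              # trans_axis permutation
    rows = xt.reshape(-1, xt.shape[-1])        # input_height x input_width
    modes = np.array([np.bincount(r).argmax() for r in rows])
    return modes.reshape(xt.shape[:-1])        # keepdim=False shape

x = np.array([[1, 2], [1, 2], [2, 2]])
print(mode_along_axis(x, axis=0))  # [1 2]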
diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
index d9618dc159a6d3f5b24bdfcfdb219ec649e051f9..9d1769e18b4b809fbc353513a05553e0ccd97572 100644
--- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
@@ -24,4 +24,6 @@ PD_REGISTER_KERNEL(segment_pool_grad,
                    ALL_LAYOUT,
                    phi::SegmentPoolGradKernel,
                    float,
-                   double) {}
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
index c38e935adf837ef00c48fa31bc1e37eea2948673..3128e534166acba6ca136331ad8efea66b18621f 100644
--- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
@@ -19,5 +19,11 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-PD_REGISTER_KERNEL(
-    segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {}
+PD_REGISTER_KERNEL(segment_pool,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SegmentPoolKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index 2b2dd5118969cf35c4762f3ab774ce41c04d2e4d..77159bfc876da603f703a13592f525d808adfbbf 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -121,17 +121,10 @@ struct ReduceMaxFunctor {
 };
 
 template <typename Tx, typename Ty = Tx>
-struct ExpSubFunctor {
-  HOSTDEVICE inline ExpSubFunctor() { y = static_cast<Tx>(0.0f); }
-
-  HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {}
-
+struct ExpFunctor {
   HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(std::exp(x - y));
+    return static_cast<Ty>(std::exp(x));
   }
-
- private:
-  Tx y;
 };
 
 template <typename Tx, typename Ty = Tx>
@@ -293,10 +286,14 @@ __global__ void WarpSoftmaxForward(T* softmax,
   }
 
   // data src
-  AccT srcdata[kBatchSize][kLoopsV][kVSize];
-  T src_tmp[kBatchSize][kLoopsV][kVSize];
-  kps::Init<AccT, kStep>(&srcdata[0][0][0], kLowInf);
-  kps::Init<T, kStep>(&src_tmp[0][0][0], -std::numeric_limits<T>::infinity());
+  // src_data: the raw data from global memory
+  // sub_data: store the data obtained by (src_data - max), used by log_softmax
+  // exp_data: store the data obtained by (exp(sub_data)), used by softmax
+  T src_data[kBatchSize][kLoopsV][kVSize];
+  AccT sub_data[kBatchSize][kLoopsV][kVSize];
+  AccT exp_data[kBatchSize][kLoopsV][kVSize];
+  kps::Init<AccT, kStep>(&sub_data[0][0][0], kLowInf);
+  kps::Init<T, kStep>(&src_data[0][0][0], -std::numeric_limits<T>::infinity());
 
   // data dst
   T out_tmp[kBatchSize][kLoopsV][kVSize];
@@ -313,11 +310,11 @@ __global__ void WarpSoftmaxForward(T* softmax,
   for (int i = 0; i < kBatchSize; ++i) {
     const VecT* src_v =
         reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
-    VecT* reg_v = reinterpret_cast<VecT*>(&src_tmp[i][0][0]);
+    VecT* reg_v = reinterpret_cast<VecT*>(&src_data[i][0][0]);
     kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
         &reg_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1);
     kps::ElementwiseUnary<T, AccT, kVItem, 1, 1, DataTransFunctor<T, AccT>>(
-        &srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor<T, AccT>());
+        &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor<T, AccT>());
   }
 
   // compute max
@@ -327,14 +324,16 @@ __global__ void WarpSoftmaxForward(T* softmax,
               1,
               ReduceMaxFunctor<AccT>,
               kMode::kLocalMode>(
-      &max[0], &srcdata[0][0][0], ReduceMaxFunctor<AccT>(), true);
+      &max[0], &sub_data[0][0][0], ReduceMaxFunctor<AccT>(), true);
   WarpReduceMax<AccT, kBatchSize, kWarpSize>(max);
 
   // compute sum
 #pragma unroll
   for (int i = 0; i < kBatchSize; ++i) {
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpSubFunctor<AccT>>(
-        &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor<AccT>(max[i]));
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+        &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor<AccT>(max[i]));
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpFunctor<AccT>>(
+        &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor<AccT>());
   }
   kps::Reduce<AccT,
               kVItem,
               kBatchSize,
               1,
               kps::AddFunctor<AccT>,
               kMode::kLocalMode>(
-      &sum[0], &srcdata[0][0][0], kps::AddFunctor<AccT>(), true);
+      &sum[0], &exp_data[0][0][0], kps::AddFunctor<AccT>(), true);
   WarpReduceSum<AccT, kBatchSize, kWarpSize>(sum);
 
   // write data to global memory
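Editor's note: keeping sub_data and exp_data side by side is the standard max-shifted softmax. log_softmax consumes (x - max) directly and softmax consumes exp(x - max), so log(exp_data) never needs to be recomputed — which is why the write-back hunk below can drop the UnaryLogFunctor pass. In NumPy terms (a hedged sketch):

import numpy as np

def warp_softmax_reference(x, log_mode=False):
    sub_data = x - x.max(axis=-1, keepdims=True)  # used by log_softmax
    exp_data = np.exp(sub_data)                   # used by softmax
    s = exp_data.sum(axis=-1, keepdims=True)
    return sub_data - np.log(s) if log_mode else exp_data / s

x = np.array([[1.0, 2.0, 3.0]])
print(warp_softmax_reference(x))        # [[0.09003057 0.24472847 0.66524096]]
print(warp_softmax_reference(x, True))  # log_softmax of the same row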
@@ -352,15 +351,13 @@
         reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
     if (LogMode) {
-      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnaryLogFunctor<AccT>>(
-          &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor<AccT>());
       kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
           &out_tmp[i][0][0],
-          &srcdata[i][0][0],
+          &sub_data[i][0][0],
           UnarySubFunctor<AccT>(std::log(sum[i])));
     } else {
       kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
-          &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
+          &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
     }
     kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
         &softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
diff --git a/paddle/phi/kernels/mode_grad_kernel.h b/paddle/phi/kernels/mode_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccde8c3648fa556401f1937c78039743daf43f4c
--- /dev/null
+++ b/paddle/phi/kernels/mode_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/mode_kernel.h b/paddle/phi/kernels/mode_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..831c4369304e5c5d27cddf01bcba021745bf7083
--- /dev/null
+++ b/paddle/phi/kernels/mode_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices);
+
+}  // namespace phi
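Editor's note: together these headers and the GPU kernels back the public paddle.mode API; a quick usage check (hedged — the exact index reported for a tie follows the last-occurrence rule of GetModebySort above):

import paddle

x = paddle.to_tensor([[2.0, 2.0, 1.0],
                      [3.0, 3.0, 3.0]])
values, indices = paddle.mode(x, axis=-1, keepdim=False)
print(values)   # [2., 3.]
print(indices)  # e.g. [1, 2], the last position of each mode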
diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20994c08aa73c33328568e334d258c44eef68171
--- /dev/null
+++ b/paddle/phi/ops/compat/mode_sig.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"});
+}
+
+KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("mode_grad",
+                         {"X", "Indices", GradVarName("Out")},
+                         {"axis", "keepdim"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping);
diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh
index 3b2df68074a82b7485b8c8f67e7d0d0fadf5fbd9..850d4015abf7a8164add9d4896d5a9bdfa26989d 100755
--- a/paddle/scripts/infrt_build.sh
+++ b/paddle/scripts/infrt_build.sh
@@ -44,6 +44,8 @@ function update_pd_ops() {
   cd ${PADDLE_ROOT}/tools/infrt/
   python3 generate_pd_op_dialect_from_paddle_op_maker.py
   python3 generate_phi_kernel_dialect.py
+  # generate test model
+  python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs
 }
 
 function init() {
diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
index 3fadf25150f9ef3556a343fdce8acc24d788f5dc..f97c2778c0918ecbfbed546089c17e9d505818cd 100644
--- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
@@ -52,6 +52,30 @@ def parse_args():
         '--debug',
         action='store_true',
         help='If used, the graph of Quant model is drawn.')
+    parser.add_argument(
+        '--quant_model_filename',
+        type=str,
+        default="",
+        help='The input model`s file name. If empty, search for the default `__model__` and separate parameter files and use them, or, if not found, attempt to load the `model` and `params` files.'
+    )
+    parser.add_argument(
+        '--quant_params_filename',
+        type=str,
+        default="",
+        help='If quant_model_filename is empty, this field is ignored. The file name holding all of the input model`s parameters. If empty, load parameters from separate files.'
+    )
+    parser.add_argument(
+        '--save_model_filename',
+        type=str,
+        default="__model__",
+        help='The name of the file to save the inference program itself. If it is set to None, a default filename `__model__` will be used.'
+    )
+    parser.add_argument(
+        '--save_params_filename',
+        type=str,
+        default=None,
+        help='The name of the file to save all related parameters. If it is set to None, parameters will be saved in separate files.'
+    )
     test_args, args = parser.parse_known_args(namespace=unittest)
     return test_args, sys.argv[:1] + args
@@ -61,18 +85,29 @@ def transform_and_save_int8_model(original_path,
                                   save_path,
                                   ops_to_quantize='',
                                   op_ids_to_skip='',
-                                  debug=False):
+                                  debug=False,
+                                  quant_model_filename='',
+                                  quant_params_filename='',
+                                  save_model_filename='',
+                                  save_params_filename=''):
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
     with fluid.scope_guard(inference_scope):
-        if os.path.exists(os.path.join(original_path, '__model__')):
-            [inference_program, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(original_path, exe)
+        if not quant_model_filename:
+            if os.path.exists(os.path.join(original_path, '__model__')):
+                [inference_program, feed_target_names,
+                 fetch_targets] = fluid.io.load_inference_model(original_path,
+                                                                exe)
+            else:
+                [inference_program, feed_target_names,
+                 fetch_targets] = fluid.io.load_inference_model(
+                     original_path, exe, 'model', 'params')
         else:
             [inference_program, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(original_path, exe,
-                                                            'model', 'params')
+             fetch_targets] = fluid.io.load_inference_model(
+                 original_path, exe, quant_model_filename,
+                 quant_params_filename)
 
     ops_to_quantize_set = set()
     print(ops_to_quantize)
@@ -97,8 +132,14 @@ def transform_and_save_int8_model(original_path,
         graph = transform_to_mkldnn_int8_pass.apply(graph)
         inference_program = graph.to_program()
         with fluid.scope_guard(inference_scope):
-            fluid.io.save_inference_model(save_path, feed_target_names,
-                                          fetch_targets, exe, inference_program)
+            fluid.io.save_inference_model(
+                save_path,
+                feed_target_names,
+                fetch_targets,
+                exe,
+                inference_program,
+                model_filename=save_model_filename,
+                params_filename=save_params_filename)
         print(
             "Success! INT8 model obtained from the Quant model can be found at {}\n"
             .format(save_path))
@@ -109,4 +150,6 @@ if __name__ == '__main__':
     test_args, remaining_args = parse_args()
     transform_and_save_int8_model(
         test_args.quant_model_path, test_args.int8_model_save_path,
-        test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug)
+        test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug,
+        test_args.quant_model_filename, test_args.quant_params_filename,
+        test_args.save_model_filename, test_args.save_params_filename)
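Editor's note: with the new parameters, a Quant model stored as combined model/params files can be converted programmatically as well as from the CLI; an illustrative call (paths and file names hypothetical):

from save_quant_model import transform_and_save_int8_model

transform_and_save_int8_model(
    original_path="./quant_model",          # hypothetical input directory
    save_path="./int8_model",
    ops_to_quantize="conv2d,pool2d",
    quant_model_filename="model.pdmodel",   # combined-file layout instead of __model__
    quant_params_filename="model.pdiparams")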
diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py
index d0552ca41f0daf56ce23317dd06cb5744baaff84..d8b1883fc62a0fb4575a2e525d7d37a9029cf40d 100644
--- a/python/paddle/fluid/dygraph/tracer.py
+++ b/python/paddle/fluid/dygraph/tracer.py
@@ -35,6 +35,12 @@ final_state_name_mapping = {
         "x": "X",
         "out": "Out",
     },
+    "pool2d": {
+        "final_op_name": "final_state_pool2d",
+        "x": "X",
+        "kernel_size": "ksize",
+        "out": "Out",
+    },
     "abs": {
         "final_op_name": "final_state_abs",
         "x": "X",
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index d1d391a3949ead28697c0756803e873c41914079..318e826058f2c111f825b113c8ee4676ff87d630 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 import paddle
 import paddle.fluid.core as core
-from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 00967cb503fe5fd677839a869798964bb5fb0b71..b35b2840ed30a2650e6e19a4cfbc381f50fd5024 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -23,7 +23,7 @@ import paddle.fluid.core as core
 from paddle.fluid import Program, compiler, program_guard
 from paddle.fluid.op import Operator
 
-from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 
 
 class ElementwiseMulOp(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
index 69bca8dd9ef15459021f44fd1b4887e636516ec6..66f2e871dac462c8e6e47357e7367755d2fc0cfc 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
@@ -849,6 +849,38 @@ def ref_softsign(x):
     return out
 
 
+class XPUTestSoftshrinkOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'softshrink'
+        self.use_dynamic_create_class = False
+
+    class XPUTestSoftshrink(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "softshrink"
+            self.dtype = self.in_type
+
+            threshold = 0.5
+            np.random.seed(1023)
+            x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype)
+            out = ref_softshrink(x, threshold)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('softshrink')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSoftshrinkOP, stype)
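Editor's note: ref_softshrink (defined just below) zeroes everything inside [-threshold, threshold] and shrinks the rest toward zero by threshold; a quick numeric check of that formula:

import numpy as np

x = np.array([-2.0, -0.3, 0.0, 0.4, 1.5])
threshold = 0.5
out = (x < -threshold) * (x + threshold) + (x > threshold) * (x - threshold)
print(out)  # [-1.5  0.   0.   0.   1. ]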
+
+
+def ref_softshrink(x, threshold=0.5):
+    out = np.copy(x)
+    out = (out < -threshold) * (out + threshold) + (out > threshold) * (
+        out - threshold)
+    return out
+
+
 class XPUTestSwishOP(XPUOpTestWrapper):
     def __init__(self):
         self.op_name = 'swish'
@@ -879,5 +911,36 @@ def ref_swish(x):
     return out
 
 
+class XPUTestThresholdedReluOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'thresholded_relu'
+        self.use_dynamic_create_class = False
+
+    class XPUTestThresholdedRelu(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "thresholded_relu"
+            self.dtype = self.in_type
+
+            threshold = 1.0
+            np.random.seed(1024)
+            x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype)
+            x[np.abs(x) < 0.005] = 0.02
+            out = ref_thresholded_relu(x, threshold)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('thresholded_relu')
+for stype in support_types:
+    create_test_class(globals(), XPUTestThresholdedReluOP, stype)
+
+
+def ref_thresholded_relu(x, threshold=1.0):
+    out = (x > threshold) * x
+    return out
+
+
 if __name__ == "__main__":
     unittest.main()
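Editor's note: ref_thresholded_relu passes a value through only when it strictly exceeds the threshold, so the threshold value itself maps to zero; for instance:

import numpy as np

x = np.array([0.5, 1.0, 2.5])
print((x > 1.0) * x)  # [0.  0.  2.5]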
diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py
index cb85ad0b7411c120b2704eb1639889202d77a0de..cb5458cf550103896a730fc7f248d3b8bfd88bbc 100644
--- a/python/paddle/incubate/tensor/math.py
+++ b/python/paddle/incubate/tensor/math.py
@@ -30,7 +30,7 @@ def segment_sum(data, segment_ids, name=None):
     where sum is over j such that `segment_ids[j] == i`.
 
     Args:
-        data (Tensor): A tensor, available data type float32, float64.
+        data (Tensor): A tensor, available data type float32, float64, int32, int64.
         segment_ids (Tensor): A 1-D tensor, which have the same size
                             with the first dimension of input data.
                             Available data type is int32, int64.
@@ -57,7 +57,8 @@ def segment_sum(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
+    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
+                                         "int64"), "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
@@ -85,7 +86,7 @@ def segment_mean(data, segment_ids, name=None):
     of all index 'segment_ids[j] == i'.
 
     Args:
-        data (tensor): a tensor, available data type float32, float64.
+        data (tensor): a tensor, available data type float32, float64, int32, int64.
         segment_ids (tensor): a 1-d tensor, which have the same size
                             with the first dimension of input data.
                             available data type is int32, int64.
@@ -113,7 +114,8 @@ def segment_mean(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
+    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
+                                         "int64"), "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
@@ -140,7 +142,7 @@ def segment_min(data, segment_ids, name=None):
     where min is over j such that `segment_ids[j] == i`.
 
     Args:
-        data (tensor): a tensor, available data type float32, float64.
+        data (tensor): a tensor, available data type float32, float64, int32, int64.
         segment_ids (tensor): a 1-d tensor, which have the same size
                             with the first dimension of input data.
                             available data type is int32, int64.
@@ -167,7 +169,8 @@ def segment_min(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
+    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
+                                         "int64"), "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
@@ -194,7 +197,7 @@ def segment_max(data, segment_ids, name=None):
     where max is over j such that `segment_ids[j] == i`.
 
    Args:
-        data (tensor): a tensor, available data type float32, float64.
+        data (tensor): a tensor, available data type float32, float64, int32, int64.
         segment_ids (tensor): a 1-d tensor, which have the same size
                             with the first dimension of input data.
                             available data type is int32, int64.
@@ -221,7 +224,8 @@ def segment_max(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
+    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
+                                         "int64"), "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 6479025e71ca74267647837e56392d9df9acfc33..211b677aeeaff02feb6573e688cf0a8dc9213134 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -141,6 +141,14 @@
   output : Tensor
   invoke : full_like(x, 1, dtype, place)
 
+- api : pool2d
+  args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
+  output : Tensor(out)
+  infer_meta :
+    func : PoolInferMeta
+  kernel :
+    func : pool2d
+
 - api : reshape
   args : (Tensor x, ScalarArray shape)
   output : Tensor(out)
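Editor's note: with the widened registrations, all four segment ops accept integer data end to end; for example (a hedged usage sketch; output inferred from the sum semantics):

import paddle

data = paddle.to_tensor([[1, 2, 3], [3, 2, 1], [4, 5, 6]], dtype='int32')
segment_ids = paddle.to_tensor([0, 0, 1], dtype='int32')
out = paddle.incubate.segment_sum(data, segment_ids)
print(out)  # [[4, 4, 4], [4, 5, 6]]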