diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 6a2e5e7ac6cd75068bba4e9b675ab67588c38366..bf838b27615028167e35a8e85e7636dd4c834016 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -56,23 +56,29 @@ static std::string LegalizeVariableName(const std::string& var_name) { return ret; } -static bool IgnoreGradAttribute(const std::string& op_type, - const std::string& attr_name) { - // Attributes in operators_with_attrs are created manually during code - // generation - // We should ignore these arbitrary attrs when setting up grad attribute map - if (operators_with_attrs.count(op_type)) { - if (operators_with_attrs[op_type].count(attr_name)) { - return true; - } - } +static std::string HandleDynamicGradAttributes(const std::string& fwd_op_type, + const std::string& attrs_name) { + std::string additional_grad_attrs_str = ""; + + if (fwd_op_type == "sum") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; + additional_grad_attrs_str = paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "scale", "float(1.0)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); - // Only allow SumOp - if (op_type != "sum") { - return true; + } else if (fwd_op_type == "scale") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; + + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); } - return false; + return additional_grad_attrs_str; } static void PrepareAttrMapForOps() { @@ -1866,18 +1872,9 @@ static std::string GenerateSingleOpBase( const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name); - for (const auto& iter : grad_attrs) { - if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue; - std::pair type_val = - GetAttrType(iter.second, false /*is_arg*/); - const char* GRAD_ATTRS_TEMPLATE = - " %s %s = %s;\n" - " %s[\"%s\"] = %s;\n"; - std::string var_name = iter.first + std::to_string(*outs_size); - grad_attrs_str += paddle::string::Sprintf( - GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second, - attrs_name, iter.first, var_name); - } + + // Handle dynamic grad attributes + grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name); generated_grad_function_body += grad_attrs_str; const char* TRACE_OP_TEMPLATE = diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 656418a05ad6d04bc19838c97d86db9cda19c1c6..d2d699e154f91d21e94e9d2dac3f703069d041e5 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -28,6 +28,7 @@ namespace = "" yaml_types_mapping = { 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'str' : 'std::string', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', @@ -212,7 +213,8 @@ def ParseYamlArgs(string): default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None - assert arg_type in yaml_types_mapping.keys() + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." arg_type = yaml_types_mapping[arg_type] arg_name = RemoveSpecialSymbolsInName(arg_name) @@ -247,7 +249,8 @@ def ParseYamlReturns(string): else: ret_type = ret.strip() - assert ret_type in yaml_types_mapping.keys() + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type @@ -1245,7 +1248,7 @@ if __name__ == "__main__": # Node Definition Generation definition_declaration_pair = GenerateForwardDefinition( fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, + forward_outputs_position_map, orig_forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs, intermediate_outputs) @@ -1257,7 +1260,7 @@ if __name__ == "__main__": # For python-level API dispatch CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, forward_outputs_position_map, - forward_attrs_list) + orig_forward_attrs_list) if len(namespace) > 0: forward_definition_str += f"""namespace {namespace} {{ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9b77f0449e01d6555cd3a25f101e4867ccc6ffd3..c0ed77ecdc4821c47410beeb06d82ed82812df40 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -24,7 +24,7 @@ atype_to_parsing_function = { "long": "CastPyArg2Long", "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", - "string": "CastPyArg2String", + "std::string": "CastPyArg2String", "std::vector": "CastPyArg2Booleans", "std::vector": "CastPyArg2Ints", "std::vector": "CastPyArg2Longs", diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 1a4f283f511da4300d26e764907998ad647eeebf..589d09bf81c1d95795cd80ed22581e52156ae417 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, return; } + // NOTE(hqp): Special case for CPU->MLU, avoid stream sync. + if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) { + paddle::framework::TensorCopy( + in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place), + out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 48850d4624a14c32d30e4562b322115127823c6b..f951b5d0f507039decfc3b4d0081f17cc9f8f50e 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -174,10 +174,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ctx->ops_, place_); #endif - auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 31a30f72e3aa6120efbaa158b1d1786dda155145..432e57107e84d9399c76f21343fe9ef5dd879473 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -148,7 +148,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; @@ -182,7 +182,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } timeline.Start(); @@ -300,7 +300,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, + i, reinterpret_cast(local_ptr[i].data()), this->table_id_, local_keys[i].data(), key_size); bool flag = true; @@ -378,8 +378,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, - local_dim_keys[i][j].data(), key_size); + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -431,7 +431,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() + VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; if (multi_node_) { auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); @@ -603,7 +603,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() << " seconds."; } @@ -746,7 +746,7 @@ void PSGPUWrapper::BeginPass() { "[BeginPass] after build_task, current task is not null.")); } - VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::EndPass() { @@ -769,7 +769,7 @@ void PSGPUWrapper::EndPass() { current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 036fde8fac6d911f8a97dbc097fae7f9fdd2ab6f..f5f6f3ecb855cfa9acb6c2169f1fc43458578a2a 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -95,6 +95,7 @@ std::map> Graph::InitFromBlock( std::unordered_map> name_to_desc_block_id; + block_id_ = block.ID(); const BlockDesc *block_var_visible = █ while (block_var_visible != nullptr) { for (auto *var : block_var_visible->AllVars()) { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 21e743e3587d80536b7bd4805298f22a99482217..10645f08dc3ba833c3a4ca75a1ac623ee2c1e8e9 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -230,6 +230,7 @@ class Graph { auto *x = AddNode(new ir::Node(var_desc, block_id == -1 ? block_id_ : block_id)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -245,6 +246,7 @@ class Graph { "The OpDesc used to create operator node is null.")); auto *x = AddNode(new ir::Node(op_desc)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -263,6 +265,7 @@ class Graph { num_node_created_); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -276,6 +279,7 @@ class Graph { } auto *x = AddNode(new ir::Node(name, type, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 18068e22b7f3c31d59636bc7ab6a234e109d5ee6..164a13d1560f4d0008c2bdb5a56d8ad6f875157b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2052,18 +2052,19 @@ PDNode *patterns::Pool::operator()() { return output_var; } -PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { - auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) - ->assert_is_op("elementwise_add"); - - x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); - y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); - auto out_var = pattern->NewNode(elementwise_add_out_repr()) +PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, + const std::string elementwise_type) { + auto elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); + + x_var->AsInput()->assert_is_op_input(elementwise_type, "X"); + y_var->AsInput()->assert_is_op_input(elementwise_type, "Y"); + auto out_var = pattern->NewNode(elementwise_out_repr()) ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output(elementwise_type, "Out"); - elementwise_add_op->LinksFrom({x_var, y_var}); - elementwise_add_op->LinksTo({out_var}); + elementwise_op->LinksFrom({x_var, y_var}); + elementwise_op->LinksTo({out_var}); return out_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 062d2f9dedce65f6e16b70f0b201a4ca63b0531a..17c70ace301d39db6fcf14d01c11baab0dc7d403 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1016,20 +1016,20 @@ struct Pool : public PatternBase { PATTERN_DECL_NODE(pool_output); }; -// ElementwiseAdd used in residual connections. -// y_var is used and convolution output. -// The operator is removed, when residual -// connection fusion is on. -struct ElementwiseAdd : public PatternBase { - ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add") {} - - PDNode* operator()(PDNode* x_var, PDNode* y_var); - - PATTERN_DECL_NODE(elementwise_add_op); - PATTERN_DECL_NODE(elementwise_add_x); - PATTERN_DECL_NODE(elementwise_add_y); - PATTERN_DECL_NODE(elementwise_add_out); +// Elementwise ops +// Forward pass for element-wise operators (add, mul) +// elementwise_mul_out is the result of the operator +struct Elementwise : public PatternBase { + Elementwise(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} + + PDNode* operator()(PDNode* x_var, PDNode* y_var, + const std::string elementwise_type); + + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_x); + PATTERN_DECL_NODE(elementwise_y); + PATTERN_DECL_NODE(elementwise_out); }; // Transpose op diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index 2403e60df3918394e99c3284b2a417e336fc3bae..fc2758c27345032c1ad0831b4ee0016fa84b3f5c 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -118,7 +118,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -145,10 +145,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - conv_output, - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_x_count = 0; @@ -160,16 +160,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + if (!IsReachable(g, elementwise_identity, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -179,14 +179,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_x_count++; }; @@ -212,10 +212,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - conv_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output, + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_y_count = 0; @@ -227,16 +227,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_x, conv_output)) return; + if (!IsReachable(g, elementwise_x, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -246,14 +246,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_x, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_y_count++; }; @@ -282,8 +282,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( patterns::Conv conv_y_pattern{pattern, name_scope}; auto conv_y_output = conv_y_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern(conv_x_output, conv_y_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add"); conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); @@ -301,10 +301,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!IsCompat(subgraph, g)) { LOG(WARNING) @@ -312,8 +312,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( return; } - if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; - if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return; Node* projection_node; Node* residual_conv_op; @@ -333,14 +333,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( if (HasFusedActivation(residual_conv_op)) return; residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op}); IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + IR_NODE_LINK_TO(residual_conv_op, elementwise_out); found_projection_conv_count++; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 371482b5343d638f005aa8e0700680b6ac00d6ec..f4358fb243f20bc9b024ef6b02768773fa995f45 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count); } -void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { +void CPUQuantizePass::QuantizeElementwise( + Graph* graph, const std::string elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + patterns::Elementwise elementwise_pattern{pattern, name_scope_}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), + pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + elementwise_type); - int quantize_elementwise_add_count = 0; + int quantize_elementwise_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize elementwise_add op"; - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); + VLOG(4) << "Quantize " + elementwise_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) { - LogQuantizationDisabled(elementwise_add_op); + if (!platform::HasOpINT8DataType(elementwise_op->Op())) { + LogQuantizationDisabled(elementwise_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!AreScalesPresentForNodes( - {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { - LogCannotQuantizeOp(elementwise_add_op, + {elementwise_x, elementwise_y, elementwise_out})) { + LogCannotQuantizeOp(elementwise_op, "No scale available for the operator"); return; } bool is_x_unsigned{false}, is_y_unsigned{false}; - auto input_x_scale = - GetScaleValueForNode(elementwise_add_x, &is_x_unsigned); - auto input_y_scale = - GetScaleValueForNode(elementwise_add_y, &is_y_unsigned); + auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned); + auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned); // TODO(sfraczek): add support for different signness if (is_x_unsigned != is_y_unsigned) { - LogCannotQuantizeOp(elementwise_add_op, - "ElementwiseAdd inputs must be of the same type."); + LogCannotQuantizeOp(elementwise_op, + "Elementwise inputs must be of the same type."); return; } - QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale, + QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); - QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, + QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + GetScaleValueForNode(elementwise_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); + DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale, + is_output_unsigned, "Scale_out"); - ++quantize_elementwise_add_count; + ++quantize_elementwise_count; }; gpd(graph, handler); - AddStatis(quantize_elementwise_add_count); + AddStatis(quantize_elementwise_count); - PrettyLogDetail("--- quantized %d elementwise_add ops", - quantize_elementwise_add_count); + PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count, + elementwise_type); } void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { @@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFc(graph); QuantizeReshape(graph); QuantizeMatmul(graph); - QuantizeElementwiseAdd(graph); + QuantizeElementwise(graph, "elementwise_add"); + QuantizeElementwise(graph, "elementwise_mul"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 412c4e40a01d50b73f72076f3a0424081d633247..3a286264e41ffe1c329ba3971d777ce4fbc05b5e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeTranspose(Graph* graph) const; void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeElementwise(Graph* graph, + const std::string elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 889417b78c8641060b8ad89219749d8400558c6a..22000865948d629a5933ad0319e41dab71433fff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add") { + } else if (type == "elementwise_add" || type == "elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "matmul" || type == "elementwise_add") { + } else if (type == "matmul" || type == "elementwise_add" || + type == "elementwise_mul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) { expected_operators, added_nodes, 1.0f); } -static const std::initializer_list variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; +static const std::initializer_list variable_names_elementwise = { + "a", "b", "c", "d", "e", "f"}; -ProgramDesc BuildProgramDescElementwiseAdd() { +ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, + const std::string elementwise_name) { ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { + for (auto& v : variable_names_elementwise) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, + SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; } -TEST(CpuQuantizePass, elementwise_add) { +void TestElementwise(const std::string elementwise_type, + const std::string elementwise_name) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, SCALE * S8_MAX); + {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, + SCALE * S8_MAX); } -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { +void TestElementwiseOutputScaleMissing(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "e"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "e"); } -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { +void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "", "b"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "", "b"); +} + +TEST(CpuQuantizePass, elementwise_add) { + TestElementwise("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_mul) { + TestElementwise("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); } const std::vector churn_out_vars(ProgramDesc* prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 5f74b61ee86aad10880f3a67d8250026a6e9ac18..3b883dac9782af8350b3e22d2954e21789a1a120 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set supported_op_types = std::unordered_set( - {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", - "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", - "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice"}); + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", + "elementwise_mul", "fc", "matmul", "nearest_interp", + "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", + "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 7e61d6ae4248b3f41fd950fcf80e0306bd0971bb..8c51c278d4872bd5b0b019223fb0e778df390732 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -125,6 +125,7 @@ class Node { // Only use this for auto parallel. // A node does not have original desc if the return is zero. uint64_t OriginalDescId() const { return original_desc_id_; } + int GraphId() const { return graph_id_; } bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } @@ -246,10 +247,12 @@ class Node { // Store the original id of var desc or op desc. // Only use this for auto parallel. uint64_t original_desc_id_{0}; + int graph_id_{-1}; private: // ID can only set by a Graph. void SetId(int id) { id_ = id; } + void SetGraphId(int graph_id) { graph_id_ = graph_id; } // desc_order can only set by a Graph when constructing a Graph from a // BlockDesc. diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ece48158586404cfe0c956fd66a20dc0db1ce96e..f30d1ea1b83dde65bdb703d511d0246fe4886113 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -41,6 +41,7 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ops_, place_); #endif platform::ScopedFlushDenormal flush; for (auto &op : ops_) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f23a266ef03641bc8f8d273b15ab4982e377cb03..ad01adf1a25b9d65c6ca85bc7e7a40d4b1fd0198 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1456,7 +1456,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif -#ifdef PADDLE_WITH_XPU + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || @@ -1470,17 +1471,36 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(type_); - if (platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.library_type_ = LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << type_ - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << type_ + << ", using_kernel_key:" << expected_kernel_key; + } + bool is_xpu_unsupport = + (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(type_)); + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 10ceae62dccbbab9329b73e0f581b51508511194..5de861235461ff6670503f6372961bdcf0be5ec2 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1224,8 +1224,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, proto::VarType::TensorDesc desc; { // int32_t size // proto buffer - int32_t size; + int32_t size = -1; is.read(reinterpret_cast(&size), sizeof(size)); + PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable( + "Cannot read tensor desc size")); + PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument( + "Tensor desc size should >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 149202468be6c6bec833f100adfd4100c520f8f3..7d60b7d26f3fbceaca9b19995ff2c5d29ad426b8 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -124,7 +124,7 @@ AmpOperators::AmpOperators() OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(), unsupported_ops_gpu_bf16.end()); -// NOTE: GPU/NPU/XPU is compiled seperatly. +// NOTE: GPU/NPU/XPU/MLU is compiled seperatly. #elif defined(PADDLE_WITH_ASCEND_CL) auto unsupported_ops_npu_fp16 = std::get<2>( OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16)); @@ -143,6 +143,15 @@ AmpOperators::AmpOperators() OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(), unsupported_ops_xpu_bf16.end()); +#elif defined(PADDLE_WITH_MLU) + auto unsupported_ops_mlu_fp16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16)); + unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(), + unsupported_ops_mlu_fp16.end()); + auto unsupported_ops_mlu_bf16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16)); + unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(), + unsupported_ops_mlu_bf16.end()); #endif VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " " << unsupported_fp16_ops_->size() << " " @@ -209,7 +218,10 @@ inline bool NeedCast(const std::shared_ptr& var) { auto data_type = GetDataType(var); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || - paddle::platform::is_xpu_place(place)) { + paddle::platform::is_xpu_place(place) || + paddle::platform::is_mlu_place(place) || + paddle::platform::is_npu_place(place) || + paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader if (data_type == paddle::framework::proto::VarType::FP32 || data_type == paddle::framework::proto::VarType::FP16 || diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index bae49fb381a475dd8227d1dc855a6db28c9cd273..a427b9b8199116098d149689961cedf14e86e5e1 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_XPU +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() @@ -243,29 +243,36 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } - #endif #ifdef PADDLE_WITH_XPU_KP - expected_kernel_key.place_ = platform::XPUPlace(); - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(op.Type()); - if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; - } - if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; - } - if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.place_ = platform::XPUPlace(); - expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << op.Type() + << ", using_kernel_key:" << expected_kernel_key; + } + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8deb3b93e9c50489dcfc6805063f23e3705cb634..16f2df79246f782ead9cc3177679674d98c3d1a9 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -341,7 +341,6 @@ void BuildDygraphPhiKernelContext( } for (size_t i = 0; i < attr_names.size(); ++i) { - VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index c55599cc9aa954e2bd437f0917c792e4fdb6b577..d18c8e96c49b6a993fbd0a8d632212ae8d7f8c6d 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -390,8 +390,8 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, } phi::KernelSignature Tracer::GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const { auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); framework::RuntimeContext ctx({}, {}); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -406,7 +406,7 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature( attr_checker == nullptr ? empty_attrs_map : attr_checker->GetDefaultAttrMap(); auto dygraph_exe_ctx = - imperative::DygraphExecutionContext( + imperative::DygraphExecutionContext( *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto* opbase_with_kernel = diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index fd13fce6a6e17a47a7a91dfa78598a99ec22f0b7..f24961885c9b85b03c561f60f375b1a21bf086dd 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -156,8 +156,8 @@ class Tracer { } phi::KernelSignature GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const; paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index a9563db74d46a57353920e3512868e312dca6765..65080f04832d68dc7bb44a992b5d072bb3384813 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1485,6 +1485,13 @@ REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); +REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor, + HardShrinkGradFunctor); +REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, + SoftShrinkGradFunctor); +REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, + TanhShrinkGradFunctor); +REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1626,22 +1633,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL(elu, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - elu_grad, ops::ELUGradKernel, - ops::ELUGradKernel); -REGISTER_OP_CPU_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); - /* ========================================================================== */ /* ======================== logit register ============================ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 20adf9136aef9783060b0d4b66134be4f5a07619..8a19a79fd43dd2dda4be8077663739a6bb0e807b 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -280,6 +280,15 @@ USE_PHI_FUNCTOR(BRelu) USE_PHI_FUNCTOR(ThresholdedRelu) USE_PHI_FUNCTOR(LeakyRelu) USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) +USE_PHI_FUNCTOR(HardShrink) +USE_PHI_FUNCTOR(SoftShrink) +USE_PHI_FUNCTOR(TanhShrink) +USE_PHI_FUNCTOR(Silu) +USE_PHI_FUNCTOR(ELU) +USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) + +template +using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; template struct SigmoidGradFunctor : public BaseActivationFunctor { @@ -393,31 +402,6 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { } }; -// silu(x) = x / (1 + exp(-x)) -template -struct SiluFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); - out.device(d) = x * temp; - } -}; - -// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) -template -struct SiluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) - auto temp2 = x * (-x).exp(); // x*e^(-x) - dx.device(d) = dout * ((static_cast(1) / temp1) * - (static_cast(1) + (temp2 / temp1))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -922,59 +906,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct ELUFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - (x < static_cast(0)) - .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); - } -}; - -template -struct ELUGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - dx.device(d) = (out > static_cast(0)) - .select(dout, dout * (out + static_cast(alpha))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - dx.device(d) = (x > static_cast(0)) - .select(dout, dout * static_cast(alpha) * x.exp()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template class ELUGradKernel : public framework::OpKernel { public: @@ -1207,44 +1138,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ELUGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* ddX, framework::Tensor* ddOut, - const framework::Tensor* dOut, framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad")); - - if (dX) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad")); - dx.device(*d) = ddx * dout * static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast(); - } - - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); - ddout.device(*d) = ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CELUGradGradFunctor : public BaseActivationFunctor { float alpha; @@ -1984,26 +1877,22 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } // namespace operators } // namespace paddle -#define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ - __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ - __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ - __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ - __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ - __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ - __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ - __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ - __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ - __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ - __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ - __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ - __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(mish, Mish, MishFunctor, MishGradFunctor); \ +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ + __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ + __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ + __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ + HardSigmoidGradFunctor); \ + __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ + __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 6e1ac642d11976c9fa50cd0a8e937c333d115bd3..17476c48e41ac3b28ae4a90e7f30e5ad64a42b6c 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { @@ -44,35 +46,6 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaSiluFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // silu(x) = x / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x / (one + exp(-x))); - } -}; - -template -struct CudaSiluGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp = one / (one + exp(-x)); - return static_cast(dout * (temp * (one + x * (one - temp)))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaLogSigmoidFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -110,43 +83,6 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaSoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // softshrink(x) = x - lambda, if x > lambda; - // x + lambda, if x < -lambda; - // 0, otherwise. - __device__ __forceinline__ T operator()(const T x) const { - T l = static_cast(lambda); - T temp1 = static_cast(x > l); - T temp2 = static_cast(x < -l); - return temp1 * (x - l) + temp2 * (x + l); - } -}; - -template -struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // dx = dout, if x > lambda or x < -lambda else 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T l = static_cast(lambda); - return (x >= -l && x <= l) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -495,66 +431,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhShrinkFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanhshrink(x) = x - tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x - tanh(x)); - } -}; - -template -struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * tanh(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * tanh(x) * tanh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaHardShrinkFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x - __device__ __forceinline__ T operator()(const T x) const { - T t = static_cast(threshold); - return (x > -t && x < t) ? zero : x; - } -}; - -template -struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = (x > -threshold && x < threshold) ? 0 : dout - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t = static_cast(threshold); - return (x > -t && x < t) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaHardSigmoidFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -722,110 +598,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaELUFunctor : public BaseActivationFunctor { - using CT = typename details::MPTypeTrait::Type; - CT zero = static_cast(0.0f); - CT one = static_cast(1.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // elu(x) = x, if x > 0 - // elu(x) = alpha * (e^x - 1), if x <= 0 - __device__ __forceinline__ T operator()(const T arg_x) const { - CT x = static_cast(arg_x); - CT temp = static_cast(alpha) * (exp(x) - one); - CT res = x > zero ? x : temp; - return static_cast(res); - } -}; - -template -struct CudaELUGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType a = static_cast(alpha); - MPType out_pos = static_cast(out > zero); - MPType out_neg = static_cast(out <= zero); - return static_cast(dout * (out_pos + out_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType x = static_cast(arg_x); - MPType a = static_cast(alpha); - MPType x_pos = static_cast(x > zero); - MPType x_neg = static_cast(x <= zero); - return static_cast(dout * (x_pos + x_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradCudaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - auto* x = ctx.Input("X"); - auto* d_x = ctx.Output(framework::GradVarName("X")); - d_x->mutable_data(ctx.GetPlace()); - const float alpha = ctx.Attr("alpha"); - - auto& dev_ctx = ctx.device_context(); - std::vector ins = {d_out, out}; - std::vector outs = {d_x}; - if (alpha > 0) { - CudaELUGradFunctor functor; - functor.alpha = alpha; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } else { - CudaELUGradNegativeAlphaFunctor functor; - functor.alpha = alpha; - ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - template struct CudaCELUFunctor : public BaseActivationFunctor { using CT = typename details::MPTypeTrait::Type; @@ -958,6 +730,15 @@ USE_PHI_FUNCTOR(CudaTanh) USE_PHI_FUNCTOR(CudaBRelu) USE_PHI_FUNCTOR(CudaLeakyRelu) USE_PHI_FUNCTOR(CudaThresholdedRelu) +USE_PHI_FUNCTOR(CudaHardShrink) +USE_PHI_FUNCTOR(CudaSoftShrink) +USE_PHI_FUNCTOR(CudaTanhShrink) +USE_PHI_FUNCTOR(CudaSilu) +USE_PHI_FUNCTOR(CudaELU) + +template +using CudaELUGradNegativeAlphaFunctor = + phi::funcs::CudaELUGradNegativeAlphaFunctor; } // namespace operators } // namespace paddle @@ -1017,26 +798,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== elu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - elu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - elu_grad, ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel); - -REGISTER_OP_CUDA_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); /* ========================================================================== */ /* ======================== celu register ============================ */ @@ -1213,7 +974,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ @@ -1244,63 +1004,221 @@ REGISTER_OP_CUDA_KERNEL( FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP -#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type, KP, plat::XPUPlace, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ - ops::ActivationGradCudaKernel>); - -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, - CudaSoftplusGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, - CudaHardSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, - CudaCELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, - CudaLog1pGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, - CudaBReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, - CudaSoftReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, - CudaSoftsignGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, - CudaRelu6GradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, - CudaHardShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, - CudaHardSigmoidFunctor, - CudaHardSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, - CudaSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, - CudaThresholdedReluFunctor, - CudaThresholdedReluGradFunctor); +REGISTER_OP_KERNEL( + brelu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + brelu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + ceil_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + celu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + elu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + exp_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + floor_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_shrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_shrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + leaky_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + leaky_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log1p_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + logsigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + logsigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + reciprocal, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + reciprocal_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu6_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + silu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + soft_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softplus_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + softshrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softshrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softsign_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sqrt_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(square, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + square_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + thresholded_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + thresholded_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); #endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 0e64b461786cce845f7388a520c09101dcba9c09..6507890a8b5dcd7a415215caf51bd05c2857db5e 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -20,6 +21,8 @@ namespace operators { template class MLUBatchNormOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto &place = ctx.GetPlace(); @@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel { // alloc memory y->mutable_data(place); - mean_out->mutable_data(place); - variance_out->mutable_data(place); - saved_mean->mutable_data(place); - saved_variance->mutable_data(place); + mean_out->mutable_data(place); + variance_out->mutable_data(place); + saved_mean->mutable_data(place); + saved_variance->mutable_data(place); Tensor transformed_x; Tensor transformed_y; @@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel { template class MLUBatchNormGradOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto d_x_tmp = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto scale_grad_tmp = ctx.AllocateTmpTensor( + scale->dims(), dev_ctx); auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_x == nullptr) { d_x = &d_x_tmp; @@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { const auto &place = ctx.GetPlace(); d_x->mutable_data(place); - d_scale->mutable_data(place); - d_bias->mutable_data(place); + d_scale->mutable_data(place); + d_bias->mutable_data(place); use_global_stats = is_test || use_global_stats; diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc index 90910bbbb2050bad85d10e0467a099c42030c084..889cdac8f6882744c7a7044861d237964e6f6ac0 100644 --- a/paddle/fluid/operators/cumprod_op.cc +++ b/paddle/fluid/operators/cumprod_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,14 +23,6 @@ namespace operators { class CumprodOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod"); - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } }; class CumprodOpMaker : public framework::OpProtoAndCheckerMaker { @@ -82,9 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker, ops::CumprodGradOpMaker, - ops::CumprodGradOpMaker); + ops::CumprodGradOpMaker, + CumprodInferShapeFunctor); REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp); diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 98247fbc862bbc199316a4d4c8971d0f4a159544..6959b5cf811069cc66321d2129a2b69d4e922f09 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/determinant_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,11 +24,6 @@ namespace operators { class DeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } }; class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", - "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", - framework::GradVarName("Input"), "DeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -162,19 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker); - -REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) + ops::DeterminantGradOpMaker, + DeterminantInferShapeFunctor); -REGISTER_OP_CPU_KERNEL(determinant, - ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CPU_KERNEL( - determinant_grad, ops::DeterminantGradKernel, - ops::DeterminantGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp, + DeterminantGradInferShapeFunctor); REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu index d19d4c3d093860c1f603e4d752063b7a858c0460..d8237fa3004e65ce74ece55e4d0e62b5564e5c5f 100644 --- a/paddle/fluid/operators/determinant_op.cu +++ b/paddle/fluid/operators/determinant_op.cu @@ -17,14 +17,6 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - determinant, ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CUDA_KERNEL( - determinant_grad, - ops::DeterminantGradKernel, - ops::DeterminantGradKernel); REGISTER_OP_CUDA_KERNEL( slogdeterminant, ops::SlogDeterminantKernel, diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index f89ecd37222870f73d00870c9454bf5590d504e3..e6de0ee3548b7442ac5e059331502cac441020e5 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -23,10 +23,13 @@ #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -40,232 +43,6 @@ T sign(T val) { return static_cast(T(0) < val) - (val < T(0)); } -template -class EigenMatrix {}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXf; -}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXd; -}; - -inline int64_t GetBatchCount(const framework::DDim dims) { - int64_t batch_count = 1; - auto dim_size = dims.size(); - PADDLE_ENFORCE_GE( - dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - - // Cumulative multiplying each dimension until the last 2 to get the batch - // count, - // for example a tensor with shape [3,3,3,3], the batch count of matrices is - // 9. - for (int64_t i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - - return batch_count; -} - -template -struct DeterminantFunctor { - void operator()(const Tensor& input, const framework::ExecutionContext ctx, - int64_t rank, int64_t batch_count, Tensor* output) { - std::vector input_vec; - std::vector output_vec; - framework::TensorToVector(input, ctx.device_context(), &input_vec); - for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel - auto begin_iter = input_vec.begin() + i * rank * rank; - auto end_iter = input_vec.begin() + (i + 1) * rank * rank; - std::vector sub_vec(begin_iter, - end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); - for (int64_t i = 0; i < rank; ++i) { - for (int64_t j = 0; j < rank; ++j) { - matrix(i, j) = sub_vec[rank * i + j]; - } - } - output_vec.push_back(matrix.determinant()); - } - framework::TensorFromVector(output_vec, output); - } -}; -template -class DeterminantKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto input_dim = vectorize(input->dims()); - auto input_dim_size = input_dim.size(); - auto* output = context.Output("Out"); - - auto batch_count = GetBatchCount(input->dims()); - VLOG(2) << "input dim:" << input->dims(); - PADDLE_ENFORCE_GE( - input_dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], - input_dim[input_dim_size - 2], - platform::errors::InvalidArgument( - "the input matrix should be square matrix.")); - auto rank = input_dim[input_dim_size - 1]; // square matrix length - DeterminantFunctor()(*input, context, rank, batch_count, output); - auto output_dims = phi::slice_ddim(input->dims(), 0, input_dim_size - 2); - if (input_dim_size > 2) { - output->Resize(output_dims); - } else { - // when input is a two-dimension matrix, The det value is a number. - output->Resize({1}); - } - VLOG(2) << "output dim:" << output->dims(); - } -}; - -template -struct FoundZeroFunctor { - FoundZeroFunctor(const T* x, int64_t numel, bool* res) - : x_(x), numel_(numel), res_(res) {} - HOSTDEVICE void operator()(size_t idx) const { - if (*res_ || idx >= static_cast(numel_)) { - // founded zero number - return; - } - *res_ = (x_[idx] == static_cast(0)); - } - const T* x_; - int64_t numel_; - bool* res_; -}; - -template -inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, - const framework::Tensor* det) { - auto& dev_ctx = ctx.template device_context(); - auto numel = det->numel(); - - framework::Tensor dev_tensor; - auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); - - // set false - phi::funcs::SetConstant zero; - zero(dev_ctx, &dev_tensor, false); - - // find whether zero - platform::ForRange for_range(dev_ctx, numel); - FoundZeroFunctor functor(det->data(), numel, data); - for_range(functor); - - // copy to host - dev_ctx.Wait(); - framework::Tensor cpu_tensor; - framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor); - - // if founded zero, the matrix is not invertible - // else the matrix is invertible - auto* res = cpu_tensor.data(); - return !(*res); -} - -template -class DeterminantGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& orig_dev_ctx = context.template device_context(); - const auto* input = context.Input("Input"); - const auto* det = context.Input("Out"); - const auto* grad = - context.Input(framework::GradVarName("Out")); - auto* ddet = - context.Output(framework::GradVarName("Input")); - - auto input_dims_size = input->dims().size(); - if (input_dims_size > 2) { - PADDLE_ENFORCE_EQ( - grad->dims().size() + 2, input_dims_size, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else if (input_dims_size == 2) { - // input dims size 2 and grad dims size 1 is possible - PADDLE_ENFORCE_EQ( - grad->dims().size(), 1, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else { - // checked in forward, pass - } - - auto& dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE&>( - orig_dev_ctx); - - // Check Whether the matrix is invertible - // (matrix A not invertible) == (det(A)=0) - if (!CheckMatrixInvertible(context, det)) { - // The matrix is not invertible - VLOG(3) << "The input matrix not invertible!"; - ddet->Resize(input->dims()); - phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), - ddet); - return; - } - - // The matrix is invertible - // let |A| = Determinant(A) - // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf - // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, - // -1) - - // First: inverse(A) - framework::Tensor inverse_A; - // A must be square matrices! - inverse_A.Resize(input->dims()); - inverse_A.mutable_data(context.GetPlace()); - - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(orig_dev_ctx, *input, &inverse_A); - - VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); - - // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = - phi::TransposeLast2Dim(dev_ctx, inverse_A); - - VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " - << transpose_inverse_A.dims(); - - // Third: dA * |A| - auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); - VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); - - // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); - VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); - - // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); - - VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); - - framework::TensorCopy(res, context.GetPlace(), ddet); - - ddet->Resize(input->dims()); - VLOG(3) << "d|A| dims: " << ddet->dims(); - } -}; - template struct SlogDeterminantFunctor { void operator()(const Tensor& input, const framework::ExecutionContext ctx, @@ -280,7 +57,7 @@ struct SlogDeterminantFunctor { auto end_iter = input_vec.begin() + (i + 1) * rank * rank; std::vector sub_vec(begin_iter, end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); + typename phi::detail::EigenMatrix::MatrixType matrix(rank, rank); for (int64_t i = 0; i < rank; ++i) { for (int64_t j = 0; j < rank; ++j) { matrix(i, j) = sub_vec[rank * i + j]; @@ -311,7 +88,7 @@ class SlogDeterminantKernel : public framework::OpKernel { auto input_dim_size = input_dim.size(); auto* output = context.Output("Out"); - auto batch_count = GetBatchCount(input->dims()); + auto batch_count = phi::detail::GetBatchCount(input->dims()); VLOG(2) << "input dim:" << input->dims(); PADDLE_ENFORCE_GE( input_dim_size, 2, @@ -370,7 +147,9 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); auto absslogdet_val = slogdet_vec[0]; - if (!CheckMatrixInvertible(context, &absslogdet_val)) { + if (!phi::detail::CheckMatrixInvertible< + T, typename framework::ConvertToPhiContext::TYPE>( + dev_ctx, &absslogdet_val)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 838df2e1625912dad127b672228f9cc64eb7cec3..f9347d281043ecc63acdb8ca2fb0a18dae4adc47 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,100 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -116,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseAddMKLDNNGradKernel, - ops::EltwiseAddMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index 367d602f5902e816a468d43ccfa009fe35a045fc..c68aa8d3d1b46c9013c6fe6a12510f0cdb744682 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -1,146 +1,28 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout / y - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = -dout * out / y - - platform::BinaryMKLDNNHandler y_handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, - y, nullptr, 1.0f, 1.0f, 1.0f); - - const auto y_memory = y_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_out_memory = handler.AcquireSecondSrcMemory(out); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_out_memory}, - {DNNL_ARG_DST, *dst_dy_memory}, - {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -// TODO(piotrekobi) add int8, uint8 support -REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseDivMKLDNNGradKernel, - ops::EltwiseDivMKLDNNGradKernel) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL( + elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ad8fd317013908e8908dff8bea3440e24779454e..761b401ca9a2e535e1badfee834ef9ee98a07aae 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,20 +15,35 @@ #pragma once #include #include -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; + +inline std::vector CalculateBroadcastedDims(const Tensor* x, + const Tensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + size_t j = 0; + std::vector dst_tz_ex(src_tz.size(), 1); + for (size_t i = 0; i < src_tz.size(); ++i) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + + return dst_tz_ex; +} template class EltwiseMKLDNNKernel : public framework::OpKernel { @@ -103,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory = nullptr; + std::shared_ptr dst_memory; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather @@ -135,19 +150,193 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { } }; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); +template +class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - size_t j = 0; - std::vector dst_tz_ex(src_tz.size(), 1); - for (size_t i = 0; i < src_tz.size(); ++i) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); - return dst_tz_ex; -} + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + int axis = ctx.Attr("axis"); + + auto tz = phi::vectorize(dout->dims()); + auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + + platform::ReorderMKLDNNHandler reorder_handler( + tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout), + onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } + + // elementwise_mul & elementwise_div + else { + platform::BinaryMKLDNNHandler binary_handler( + BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, + 1.0f, 1.0f); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); + const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + + if (dy) { + dnnl::primitive_attr broadcast_reduction_attr; + std::shared_ptr broadcast_src_memory; + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales(1); + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + + dst_memory = reorder_dst_memory_p; + } else { + broadcast_src_memory = reorder_src_memory_p; + } + } + + // elementwise_mul & elementwise_div + else { + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + platform::BinaryMKLDNNHandler binary_handler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + platform::BinaryMKLDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(), + y, y, nullptr, 1.0f, 1.0f, 1.0f); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = platform::BinaryMKLDNNHandler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(dout); + + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + dy->set_layout(DataLayout::kMKLDNN); + + if (dout->dims() != dy->dims()) { + // Broadcasting + if (BINARY_OP == dnnl::algorithm::binary_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + broadcast_reduction_attr.set_post_ops(po); + } + + platform::ReductionMKLDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(dy); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *broadcast_src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims())))); + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index c03794012ff3b793684222c62f423edd6e8637f1..0ef5c5e628ce62084305fc95e66862a15822ecb3 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -1,127 +1,19 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout*y - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = dout*x - // Handler is having nullptr passed instead of output tensor as - // we want Dst buffer to be allocated by oneDNN not to use Tensor - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_x_memory = handler.AcquireSecondSrcMemory(x); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_x_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -132,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseMulMKLDNNGradKernel, - ops::EltwiseMulMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 3c799008a2abcf3fc59da7b759c9d43f3e940e8e..510373831eb6db5c7ffed6e8e58cbfb0ae268a50 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -1,5 +1,4 @@ - -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,113 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - - dnnl::primitive_attr reorder_attr; - std::vector scales = {-1}; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); - dnnl::primitive_attr attr; - attr.set_post_ops(po); - - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); - - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - reduction_p->execute(astream, { - {DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}, - }); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; @@ -131,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseSubMKLDNNGradKernel, - ops::EltwiseSubMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 8a405cc6fc1baefe997fb5b6133a56d6a2fc0438..9f2b48a24b44700dc93e9eba09ea2dd2a900bdfa 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -12,12 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherOp should not be null.")); - - auto index_dims = ctx->GetInputDim("Index"); - - if (index_dims.size() == 2) { - PADDLE_ENFORCE_EQ( - index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of index should be 1 when it is 2D, but we get %d", - index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - index_dims.size(), 1, - platform::errors::InvalidArgument( - "The index should be 1D, when it is not 2D, but we get %d", - index_dims.size())); - } - - auto axis = ctx->Attrs().Get("axis"); - auto input_dim = ctx->GetInputDim("X"); - if (ctx->HasInput("Axis") || axis == 0) { - // if HasInput("Axis"), we can not obtain correct shape of output - int batch_size = index_dims[0]; - framework::DDim output_dims(input_dim); - output_dims[0] = batch_size; - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - int index_size = index_dims[0]; - std::vector out_dim_vec; - for (int i = 0; i < axis; i++) { - out_dim_vec.push_back(input_dim[i]); - } - out_dim_vec.push_back(index_size); - for (int i = axis + 1; i < input_dim.size(); i++) { - out_dim_vec.push_back(input_dim[i]); - } - auto output_dims = phi::make_ddim(out_dim_vec); - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,22 +141,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor, + PD_INFER_META(phi::GatherInferMeta)); REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOpMaker, - ops::GatherGradOpMaker); + ops::GatherGradOpMaker, + GatherInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, - ops::GatherGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, - ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel); -REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradNoNeedBufferVarInferer, + GatherGradInferShapeFunctor); + REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu deleted file mode 100644 index e0db2f26d3e0534f924cc709b98689fb3f1a5cc6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_op.cu +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - - // get axis from tensor - if (ctx.HasInput("Axis")) { - Tensor cpu_axis; - const Tensor *axis_tensor = ctx.Input("Axis"); - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT16) { - axis = static_cast(cpu_axis.data()[0]); - } - } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - const auto &dev_ctx = ctx.cuda_device_context(); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - Tensor cpu_axis; - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } - } - - const auto &dev_ctx = ctx.cuda_device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h deleted file mode 100644 index 94de694b2f9bc484cdb60298b60d5a9433dac181..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_op.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - // get axis from tensor - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *dev_ctx.eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - bool overwrite = ctx.Attr("overwrite"); - - if (index_type == phi::DataType::INT32) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); - } - } else if (index_type == phi::DataType::INT64) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index 21093f585b59eea24a231b4dcdf264dc16178fbd..f996b1ede2f0fdbf7739d579380d71e9dc3448e7 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index 3dce380360815c292153ef2bfb1a447357c90acb..b42050eabe300bea59c95c50c356d9e115c0dddf 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -24,16 +24,15 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/gather_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gather); +USE_OP_ITSELF(gather); USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP(gather_grad); +USE_OP_ITSELF(gather_grad); USE_OP_DEVICE_KERNEL(gather_grad, NPU); template diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index 28f2f7d473bef308f581266bdb1925864aca4b78..6c691aa14ae77acc3c4ebc2077ea9182e4354d54 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -13,15 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class GatherOpXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 04aa6a3e10f6e3f55f9845d1b4b6bd6aa762c016..6ee9582dacde372886075bb7c5619c6bc1b99c98 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/grid_sampler_op.h" #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -229,15 +229,6 @@ REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker, ops::GridSampleGradMaker); REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad); -REGISTER_OP_CPU_KERNEL( - grid_sampler, - ops::GridSampleOpKernel, - ops::GridSampleOpKernel); -REGISTER_OP_CPU_KERNEL( - grid_sampler_grad, - ops::GridSampleGradOpKernel, - ops::GridSampleGradOpKernel); - REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu deleted file mode 100644 index a227a8e312765b4311314ea884f2c32443924fbc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ /dev/null @@ -1,492 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -template -static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, - int sW, int H, int W, - T delta) { - if (in_bounds(h, w, H, W)) { - platform::CudaAtomicAdd(data + h * sH + w * sW, delta); - } -} - -template -static __forceinline__ __device__ T _unnormalize(T coord, int size, - bool align_corners) { - if (align_corners) { - return ((coord + 1.f) / 2) * (size - 1); - } else { - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes(T in, int max_value) { - return min(static_cast(max_value), max(in, static_cast(0))); -} - -template -static __forceinline__ __device__ T reflect_indexes(T in, int twice_low, - int twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = fabs(in - min); - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T compute_positions(T coord, int size, - PaddingMode padding_mode, - bool align_corners) { - coord = _unnormalize(coord, size, align_corners); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes(coord, size - 1); - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_indexes(coord, -1, 2 * size - 1); - } - coord = clip_indexes(coord, size - 1); - } - return coord; -} - -template -static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size, - bool align_corners, - T* grad_in) { - if (align_corners) { - *grad_in = static_cast(size - 1) / 2; - return ((coord + 1.f) / 2) * (size - 1); - } else { - *grad_in = static_cast(size) / 2; - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit, - T* grad_in) { - if (in <= static_cast(0)) { - *grad_in = static_cast(0); - return static_cast(0); - } else { - T max = static_cast(clip_limit - 1); - if (in >= max) { - *grad_in = static_cast(0); - return max; - } else { - *grad_in = static_cast(1); - return in; - } - } -} - -template -static __forceinline__ __device__ T -reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) { - if (twice_low == twice_high) { - *grad_in = static_cast(0); - return static_cast(0); - } - int grad_in_mult_; - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = in - min; - if (in < static_cast(0)) { - grad_in_mult_ = -1; - in = -in; - } else { - grad_in_mult_ = 1; - } - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - *grad_in = static_cast(grad_in_mult_); - return extra + min; - } else { - *grad_in = static_cast(-grad_in_mult_); - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T -compute_positions_with_mask(T coord, int size, PaddingMode padding_mode, - bool align_corners, T* grad_in) { - T grad_clip, grad_refl; - coord = _unnormalize_with_mask(coord, size, align_corners, grad_in); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_clip; - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl); - } else { - coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl); - } - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_refl * grad_clip; - } - - return coord; -} - -template -__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, - int out_h, int out_w, int in_h, - int in_w, const T* input, const T* grid, - T* output, const Mode mode, - const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - int out_sN = out_c * out_h * out_w; - int out_sC = out_h * out_w; - int out_sH = out_w; - int out_sW = 1; - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - ix = compute_positions(ix, in_w, padding_mode, align_corners); - iy = compute_positions(iy, in_h, padding_mode, align_corners); - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - auto inp_offset_NC = n * inp_sN; - - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast(0); - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - auto inp_offset_NC = n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) { - *out_ptr_NCHW = - input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } -} - -template -class GridSampleOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h - << "; out_w: " << out_w; - auto* output = ctx.Output("Output"); - auto* output_data = output->mutable_data(ctx.GetPlace()); - VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] - << "; " << output->dims()[2] << "; " << output->dims()[3]; - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sample_cuda_kernel< - T><<>>( - count, n, c, out_h, out_w, in_h, in_w, input->data(), - grid->data(), output_data, mode, padding_mode, align_corners); - } -}; - -template -__global__ void grid_sampler_cuda_backward_kernel( - const int nthreads, const T* grad_output, const T* input, const T* grid, - int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input, - T* grad_grid, const Mode mode, const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - - int gOut_sN = out_c * out_h * out_w; - int gOut_sC = out_h * out_w; - int gOut_sH = out_w; - int gOut_sW = 1; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - T gix_mult, giy_mult; - ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners, - &gix_mult); - iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners, - &giy_mult); - - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - T gix = static_cast(0), giy = static_cast(0); - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - int inp_offset_NC = n * inp_sN; - for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, - gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - T gOut = grad_output[gOut_offset]; - - atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, - nw * gOut); - atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, - ne * gOut); - atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, - sw * gOut); - atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, - se * gOut); - - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - for (int c = 0; c < out_c; - ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h, - in_w, grad_output[gOut_offset]); - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = static_cast(0); - gGrid_ptr_NHW[1] = static_cast(0); - } - } - } -} - -template -class GridSampleGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), - input_grad, static_cast(0)); - - T* grid_grad_data = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - } - - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sampler_cuda_backward_kernel< - T><<>>( - count, output_grad->data(), input->data(), grid->data(), n, c, - out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, - padding_mode, align_corners); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel, - ops::GridSampleOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(grid_sampler_grad, - ops::GridSampleGradOpCUDAKernel, - ops::GridSampleGradOpCUDAKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h deleted file mode 100644 index 93e96694270a458844bbcabf78f2559975902c2f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/grid_sampler_op.h +++ /dev/null @@ -1,600 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -enum class Mode { - bilinear, - nearest, -}; - -enum class PaddingMode { zeros, border, reflect }; - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array3 = Eigen::DSizes; -using Array4 = Eigen::DSizes; - -template -static inline bool isInBound(T x, T y, T x_max, T y_max) { - if (x < 0 || x > x_max || y < 0 || y > y_max) { - return false; - } - return true; -} - -template -static inline void unnormalize(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - - if (!align_corners) { - auto factor = static_cast((max_val + 1) * 0.5); - grid_slice_t.device(place) = - (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); - } else { - auto factor = static_cast(max_val * 0.5); - grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; - } -} - -template -static inline void clip(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - if (padding_mode == "border") { - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } - } -} - -template -static inline void clipWithMask(const platform::CPUDeviceContext& ctx, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode, - Tensor* grid_slice, Tensor* grid_scale) { - auto& place = *ctx.eigen_device(); - grid_scale->mutable_data(grid_slice->dims(), ctx.GetPlace()); - - auto grid_slice_t = EigenTensor::From(*grid_slice); - auto factor = static_cast(max_val * 0.5); - if (!align_corners) { - factor = static_cast((max_val + 1) * 0.5); - } - auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); - - if (padding_mode == "border") { - // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); - auto res = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - - auto in_bound = (res == grid_slice_t); - grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); - grid_slice_t.device(place) = res; - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto is_neg = (grid_slice_t < static_cast(0)); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()); - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - auto reflected = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - auto clipped = reflected.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - auto in_bound = (clipped == reflected).template cast(); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()) * - in_bound; - grid_slice_t.device(place) = clipped; - } - } -} - -template -static void calcGridLocations(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); - clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); -} - -template -static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y, Tensor* grid_x_scale, - Tensor* grid_y_scale) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clipWithMask(ctx, in_w - 1, align_corners, padding_mode, grid_x, - grid_x_scale); - clipWithMask(ctx, in_h - 1, align_corners, padding_mode, grid_y, - grid_y_scale); -} - -template -static void getGridPointValue(const Tensor& input, Tensor* output, - const Tensor& x, const Tensor& y) { - const int n = input.dims()[0]; - const int c = input.dims()[1]; - const int in_h = input.dims()[2]; - const int in_w = input.dims()[3]; - const int out_h = x.dims()[1]; - const int out_w = x.dims()[2]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto output_t = EigenTensor::From(*output).setConstant((T)0); - auto input_t = EigenTensor::From(input); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - output_t(i, j, k, l) = - input_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))); - } - } - } - } - } -} - -template -static void allNeigbors(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* x_w, Tensor* x_e, Tensor* y_n, - Tensor* y_s, // positions - Tensor* d_w, Tensor* d_e, Tensor* d_n, - Tensor* d_s, // distance - Tensor* v_wn, Tensor* v_en, Tensor* v_ws, - Tensor* v_es) { // values - auto& place = *ctx.eigen_device(); - - const int c = input.dims()[1]; - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - // calculate coords of 4 corner points - x_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - x_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto x_w_t = EigenTensor::From(*x_w); - auto x_e_t = EigenTensor::From(*x_e); - auto y_n_t = EigenTensor::From(*y_n); - auto y_s_t = EigenTensor::From(*y_s); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - - x_w_t.device(place) = grid_x_t.floor(); - x_e_t.device(place) = x_w_t + static_cast(1); - y_n_t.device(place) = grid_y_t.floor(); - y_s_t.device(place) = y_n_t + static_cast(1); - - // calculate distances to 4 sides - d_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto d_w_t = EigenTensor::From(*d_w); - auto d_e_t = EigenTensor::From(*d_e); - auto d_n_t = EigenTensor::From(*d_n); - auto d_s_t = EigenTensor::From(*d_s); - d_w_t.device(place) = grid_x_t - x_w_t; - d_e_t.device(place) = x_e_t - grid_x_t; - d_n_t.device(place) = grid_y_t - y_n_t; - d_s_t.device(place) = y_s_t - grid_y_t; - - // calc 4 corner points value - v_wn->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_en->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_ws->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_es->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - getGridPointValue(input, v_wn, *x_w, *y_n); - getGridPointValue(input, v_en, *x_e, *y_n); - getGridPointValue(input, v_ws, *x_w, *y_s); - getGridPointValue(input, v_es, *x_e, *y_s); -} - -template -static void bilinearInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, - &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto d_w_scaled_t = - d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_e_scaled_t = - d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_n_scaled_t = - d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_s_scaled_t = - d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - auto output_t = EigenTensor::From(*out); - // bilinear interpolaetion by 4 corner points - output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + - v_en_t * d_w_scaled_t * d_s_scaled_t + - v_ws_t * d_e_scaled_t * d_n_scaled_t + - v_es_t * d_w_scaled_t * d_n_scaled_t; -} - -template -static void nearestInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(input, out, *grid_x, *grid_y); -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y, const Tensor& d1, - const Tensor& d2) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto d1_t = EigenTensor::From(d1); - auto d2_t = EigenTensor::From(d2); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); - } - } - } - } - } -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l); - } - } - } - } - } -} - -template -static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx, - const Tensor& input, const Tensor& output_grad, - Tensor* grid_x, Tensor* grid_y, - Tensor* grid_x_scale, Tensor* grid_y_scale, - Tensor* input_grad, Tensor* grid_grad) { - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, - grid_x, // grid_x - grid_y, // grid_y - &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en, - &v_ws, &v_es); - - // gather output grad value to input grad by corner point coords and weight - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); - - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto output_grad_t = EigenTensor::From(output_grad); - - if (grid_grad != nullptr) { - Tensor grid_grad_x, grid_grad_y; - grid_grad_x.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - grid_grad_y.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto grid_grad_x_t = - EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); - auto grid_grad_y_t = - EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - grid_grad_x_t(i, k, l) += - ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + - (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * - output_grad_t(i, j, k, l); - grid_grad_y_t(i, k, l) += - ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + - (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * - output_grad_t(i, j, k, l); - } - } - } - } - - // const T x_max = static_cast(in_w - 1); - // const T y_max = static_cast(in_h - 1); - - auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); - auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); - grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; - grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; - - // gather grid_grad [x, y] in 3rd Dim - T* grid_grad_data = grid_grad->data(); - T* grid_grad_x_data = grid_grad_x.data(); - T* grid_grad_y_data = grid_grad_y.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_grad_data[2 * i] = grid_grad_x_data[i]; - grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; - } - } -} - -template -class GridSampleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* output = ctx.Output("Output"); - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), output, - static_cast(0)); - - Tensor grid_x, grid_y; - calcGridLocations( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y); - if (mode == "bilinear") { - bilinearInter( - ctx.template device_context(), *input, - &grid_x, &grid_y, output); - } else if (mode == "nearest") { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(*input, output, grid_x, grid_y); - } - } -}; - -template -class GridSampleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), input_grad, - static_cast(0)); - - Tensor* grid_grad = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), grid_grad, - static_cast(0)); - } - - Tensor grid_x, grid_y; - Tensor grid_x_scale, grid_y_scale; - calcGridLocationsWithGrad( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale, - &grid_y_scale); - if (mode == "bilinear") { - gatherBilinearGrad(ctx.template device_context(), - *input, *output_grad, &grid_x, &grid_y, - &grid_x_scale, &grid_y_scale, input_grad, - grid_grad); - } else { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - gatherOutputGradToInputGrad(*output_grad, input_grad, grid_x, grid_y); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index fea71edf41313f9a93c3a2a0311d0db69db3b41c..069cc9416a620cec987f6463841ecd677db8c7b4 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -13,8 +13,13 @@ // limitations under the License. #include "paddle/fluid/operators/index_select_op.h" + #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of IndexSelectOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto index_dim = ctx->GetInputDim("Index"); - auto dim = ctx->Attrs().Get("dim"); - - PADDLE_ENFORCE_EQ( - dim < input_dim.size() && dim >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_dim.size(), input_dim.size() - 1, dim)); - - PADDLE_ENFORCE_EQ( - index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), - true, platform::errors::InvalidArgument( - "The 'shape' of Input(Index) must be 1-D tensor. " - "But received: the 'shape' of Input(Index) is [%s], " - "the dimension of Input(Index) is [%d].", - index_dim, index_dim.size())); - - PADDLE_ENFORCE_EQ(index_dim[0] != 0, true, - platform::errors::InvalidArgument( - "The length of Input(Index) can't be 0.")); - - auto output_dim = phi::vectorize(input_dim); - if (dim < 0) { - dim += input_dim.size(); - } - output_dim[dim] = index_dim[0]; - ctx->SetOutputDim("Out", phi::make_ddim(output_dim)); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor, + PD_INFER_META(phi::IndexSelectInferMeta)); REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, ops::IndexSelectGradMaker, - ops::IndexSelectGradMaker); + ops::IndexSelectGradMaker, + IndexSelectInferShapeFunctor); REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, ops::IndexSelectGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - index_select, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel); -REGISTER_OP_CPU_KERNEL( - index_select_grad, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel); diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu deleted file mode 100644 index f810aee2adea540f1ffb6999ce38380ee05d0901..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_select_op.cu +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void index_select_cuda_kernel(const T* input, T* output, - const IndexT* index, int64_t N, - int64_t stride, int64_t size, - int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - output[idx] = input[input_idx]; -} - -template -__global__ void index_select_grad_cuda_kernel(const T* output_grad, - T* input_grad, - const IndexT* index, int64_t nums, - int64_t N, int64_t stride, - int64_t size, int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); -} - -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} - -template -class IndexSelectCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* index = context.Input("Index"); - auto* out = context.Output("Out"); - int dim = context.Attr("dim"); - auto input_dim = in->dims(); - auto output_dim = out->dims(); - dim = dim >= 0 ? dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = out->numel(); - - auto stream = - context.template device_context().stream(); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -template -class IndexSelectGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = context.Input(framework::GradVarName("Out")); - auto* in_grad = context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - - auto* output_grad_data = output_grad->data(); - auto* in_grad_data = in_grad->mutable_data(context.GetPlace()); - - int dim = context.Attr("dim"); - auto input_dim = in_grad->dims(); - auto output_dim = output_grad->dims(); - dim = dim >= 0 ? dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - int64_t numel = in_grad->numel(); - int64_t index_nums = index->numel(); - int64_t out_nums = output_grad->numel(); - - auto stream = - context.template device_context().stream(); - - index_select_grad_init< - T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_select, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - index_select_grad, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 04b4f69add78513bf716ab03d3bc2ba86dfbad2d..684829be2697cdc1676e8b80e15b2d600d922f3b 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, output->Resize(output_dim); } -template -class IndexSelectKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto inputs = *context.Input("X"); - auto* index = context.Input("Index"); - auto* output = context.Output("Out"); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += inputs.dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectInner(context, &inputs, *index, output, - dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectInner(context, &inputs, *index, - output, dim); - } - } -}; - template struct IndexSelectAdd { void operator()(const framework::ExecutionContext& ctx, int slice_size, @@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, x_grad->Resize(output_dim); } -template -class IndexSelectGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_grad = - context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += out_grad->dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectGradInner(context, *out_grad, *index, - x_grad, dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectGradInner(context, *out_grad, - *index, x_grad, dim); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index bce7a3c1caae39d21c9324b0f927401317284cc5..a232fba7e28d68c2df8394caa6bc5d93397f1f37 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class IndexSelectNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 8668de4d3a6288841ad191f3e47b87a76eeb1d63..1c79213757fdfa8d9ef0d7c7ab315d03f94b0c57 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -14,10 +14,13 @@ #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -60,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", input_dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -115,8 +84,10 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor, + PD_INFER_META(phi::ValueCompareInferMeta)); REGISTER_OPERATOR( isclose, ops::IscloseOp, ops::IscloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::IscloseOpVarTypeInference); + ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor); diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index dcd98054b05c314da0884e8dc6be358d3afb0483..67c1942ea0b41e480c524f9c188b2a82649ba44e 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -11,7 +11,9 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,44 +23,6 @@ using framework::Tensor; class KLDivLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_target = ctx->GetInputDim("Target"); - PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(), - platform::errors::InvalidArgument( - "Input(X) rank and Input(Target) rank should be " - "same, but received X rank(%d) != Target rank(%d)", - dim_x.size(), dim_target.size())); - for (int i = 0; i < dim_x.size(); i++) { - if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) { - PADDLE_ENFORCE_EQ( - dim_x[i], dim_target[i], - platform::errors::InvalidArgument( - "Input(X) and Input(Target) should in same shape. but received " - "X dimension[%d](%d) != Target dimension[%d](%d)", - i, dim_x[i], i, dim_target[i])); - } - } - - auto reduction = ctx->Attrs().Get("reduction"); - - auto reduction_valid = "mean" == reduction || "sum" == reduction || - "batchmean" == reduction || "none" == reduction; - PADDLE_ENFORCE_EQ( - reduction_valid, true, - platform::errors::InvalidArgument( - "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); - - if ("none" == reduction) { - ctx->SetOutputDim("Loss", dim_x); - } else { - ctx->SetOutputDim("Loss", {1}); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -171,8 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor, + PD_INFER_META(phi::KLDivInferMeta)); + REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker, - ops::KLDivLossOpGradMaker); + ops::KLDivLossOpGradMaker, + KLDivInferShapeFunctor); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, ops::KLDivLossGradNoNeedBufferVarInferer); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 65297abe3e49b8f0fe85e4102d8f449c2c02788f..88d70d9bb7dae50f9ca0d82ce53896632b8b00ed 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -221,7 +221,7 @@ class LRNOp : public framework::OperatorWithKernel { auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); auto dl = framework::StringToDataLayout(data_format); - // Some models may have intentionally set "AnyLayout" for pool + // Some models may have intentionally set "AnyLayout" for lrn // op. Treat this as NCHW (default data_format value) if (dl != framework::DataLayout::kAnyLayout) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 214b2eccae9f75e9bfcfa3df0b823918e2b0c353..6e2ac4617da4df8e4ebaf92d4193ef8b3368b97a 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -18,8 +18,8 @@ limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" @@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, const auto W = udims[udims.size() - 1]; auto L_dataptr = L->mutable_data(dev_ctx.GetPlace()); platform::ForRange x_for_range(dev_ctx, LU->numel()); - TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, + L_dataptr); x_for_range(tril_computer); - TrilTriuCompute triu_computer(LU->data(), 0, false, H, W, - U->mutable_data(dev_ctx.GetPlace())); + phi::funcs::TrilTriuCompute triu_computer( + LU->data(), 0, false, H, W, U->mutable_data(dev_ctx.GetPlace())); x_for_range(triu_computer); // set L's diagonal 1 @@ -532,15 +533,15 @@ class LUGradKernel : public framework::OpKernel { auto phil_rank = LmHdims.size(); auto phiu_rank = UmHdims.size(); platform::ForRange l_for_range(dev_ctx, phi_L.numel()); - TrilTriuCompute tril_computer(phi_L.data(), -1, true, - LmHdims[phil_rank - 2], - LmHdims[phil_rank - 1], phi_L.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_L.data(), -1, true, LmHdims[phil_rank - 2], + LmHdims[phil_rank - 1], phi_L.data()); l_for_range(tril_computer); platform::ForRange u_for_range(dev_ctx, phi_U.numel()); - TrilTriuCompute triu_computer(phi_U.data(), 0, false, - UmHdims[phiu_rank - 2], - UmHdims[phiu_rank - 1], phi_U.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_U.data(), 0, false, UmHdims[phiu_rank - 2], + UmHdims[phiu_rank - 1], phi_U.data()); u_for_range(triu_computer); Tensor_Add(dev_ctx, phi_L, phi_U, &phi); @@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute tril_computer(phi_complement.data(), -1, true, H, - W, phi_complement_l.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_complement.data(), -1, true, H, W, + phi_complement_l.data()); x_for_range(tril_computer); Tensor_Sub(dev_ctx, phi, phi_complement_l, &phi); @@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute triu_computer(phi_complement.data(), 0, false, H, W, - phi_complement_u.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_complement.data(), 0, false, H, W, phi_complement_u.data()); x_for_range(triu_computer); Tensor_Sub(dev_ctx, phi, phi_complement_u, &phi); diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h index d2303f2c08da8e98053e314f9756e4e375e27775..e4100867dc685ef68cd01b22ab7972aa8b436a06 100644 --- a/paddle/fluid/operators/lu_unpack_op.h +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -16,7 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lu_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" namespace paddle { namespace operators { @@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { auto W = ldims[ldims.size() - 1]; auto L_dataptr = dl_tril.mutable_data(dev_ctx.GetPlace()); platform::ForRange l_for_range(dev_ctx, dl->numel()); - TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, + L_dataptr); l_for_range(tril_computer); const auto udims = du->dims(); @@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { W = udims[udims.size() - 1]; auto U_dataptr = du_triu.mutable_data(dev_ctx.GetPlace()); platform::ForRange u_for_range(dev_ctx, du->numel()); - TrilTriuCompute triu_computer(du->data(), 0, false, H, W, U_dataptr); + phi::funcs::TrilTriuCompute triu_computer(du->data(), 0, false, H, W, + U_dataptr); u_for_range(triu_computer); auto xdims = dx->dims(); diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index cdf204628b638f877c92e35a8941487aa39b5427..56f65340ea999f48702294f912c4354d83990881 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -14,8 +14,11 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,26 +26,6 @@ namespace operators { class MatrixPowerOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matrix_power"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matrix_power"); - auto dims = ctx->GetInputDim("X"); - auto n_dim = dims.size(); - PADDLE_ENFORCE_GE(n_dim, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. But " - "received a %d dimension tensor.", - n_dim)); - PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - dims[n_dim - 2], dims[n_dim - 1])); - ctx->SetOutputDim("Out", dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class MatrixPowerOpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,9 +99,14 @@ class MatrixPowerGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(matrix_power, MatrixPowerInferShapeFunctor, + PD_INFER_META(phi::MatrixPowerInferMeta)); + REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerOpInferVarType, ops::MatrixPowerGradOpMaker, - ops::MatrixPowerGradOpMaker); + ops::MatrixPowerGradOpMaker, + MatrixPowerInferShapeFunctor); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index bdb4fe1198a8e550cb22cf4d727f6b7da34c28fe..86ecb01c89af7e12cc2a3dbcf91740d65d0ac247 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -50,13 +50,8 @@ class PReluMKLDNNHandler if (weights->dims().size() != x->dims().size()) { auto new_weights_dims = std::vector(x->dims().size(), 1); if (mode == "channel") { - if (data_format == "NHWC") { - new_weights_dims[x->dims().size() - 1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } else { - new_weights_dims[1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } + new_weights_dims[1] = + *std::max_element(weights_dims.begin(), weights_dims.end()); } weights_dims = std::move(new_weights_dims); } diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 717af61b858dc16f9bdda20f530cbf06a09908eb..0e988557df6262d4f924323084daf062aee75e0c 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -98,7 +98,7 @@ TEST(test_pool2d_transpose_nhwc, cpu_place) { TEST(test_pool2d_relu_relu_nhwc, cpu_place) { framework::DDim dims({1, 4, 8, 512}); // NHWC shape - framework::DDim expected_dims({1, 512, 3, 7}); // NHWC expected shape + framework::DDim expected_dims({1, 512, 3, 7}); // NCHW expected shape platform::CPUPlace p; framework::Scope scope; diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index c7fb92cd5107cee12e0995948e320ef3ed616f4d..9c16ccb138f7da56568ce6224dc30deb5bbccb7f 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of ModeOp must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of ModeOp must have >= 1d shape")); - if (axis < 0) axis += dim_size; - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input shape should >= 1d")); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor, + PD_INFER_META(phi::ModeInferMeta)); REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, ops::ModeGradOpMaker, - ops::ModeGradOpMaker); -REGISTER_OP_CPU_KERNEL(mode, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel); - + ops::ModeGradOpMaker, + ModeInferShapeFunctor); REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); -REGISTER_OP_CPU_KERNEL( - mode_grad, ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu deleted file mode 100644 index 2bacda8afb0eb340c4c8d4068f3013e2adbc7f91..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mode_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" - -namespace paddle { -namespace operators { - -int ComputeBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -void getModebySort(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, - const int64_t num_cols, const int64_t num_rows, - T* out_tensor, int64_t* indices_tensor) { - framework::Tensor input_tmp; - framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); - T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); - thrust::device_ptr out_tensor_ptr(out_tensor); - thrust::device_ptr indices_tensor_ptr(indices_tensor); - - for (int64_t i = 0; i < num_rows; ++i) { - T* begin = input_tmp_data + num_cols * i; - T* end = input_tmp_data + num_cols * (i + 1); - thrust::device_vector indices_data(num_cols); - thrust::sequence(thrust::device, indices_data.begin(), - indices_data.begin() + num_cols); - thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); - int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, - begin + 1, 0, thrust::plus(), - thrust::not_equal_to()); - thrust::device_vector keys_data(unique); - thrust::device_vector cnts_data(unique); - thrust::reduce_by_key(thrust::device, begin, end, - thrust::constant_iterator(1), keys_data.begin(), - cnts_data.begin()); - auto it = thrust::max_element(thrust::device, cnts_data.begin(), - cnts_data.begin() + unique); - T mode = keys_data[it - cnts_data.begin()]; - int64_t counts = cnts_data[it - cnts_data.begin()]; - auto pos = thrust::find(thrust::device, begin, end, mode); - int64_t index = indices_data[pos - begin + counts - 1]; - out_tensor_ptr[i] = static_cast(mode); - indices_tensor_ptr[i] = static_cast(index); - } -} - -template -class ModeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - getModebySort(dev_ctx, input, input_width, input_height, output_data, - indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - for (int i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - // second step, tranpose the input - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, ctx.GetPlace()); - int ndims = trans_axis.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans_axis); - framework::Tensor trans_ind; - int64_t* trans_ind_data = - trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); - framework::Tensor trans_out; - T* trans_out_data = - trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - getModebySort(dev_ctx, &trans_input, input_width, input_height, - trans_out_data, trans_ind_data); - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans_axis); - TransCompute(ndims, dev_ctx, trans_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - - if (axis < 0) axis += in_dims.size(); - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - int block_size = ComputeBlockSize(post); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mode, ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - mode_grad, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h deleted file mode 100644 index 76d356ed16eb3f81b10d541230f49b73fd836543..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.h +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -static void getMode(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - T mode = 0; - int64_t indice = 0; - int64_t cur_freq = 0; - int64_t max_freq = 0; - for (int64_t i = 0; i < input_width; ++i) { - ++cur_freq; - if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { - if (cur_freq > max_freq) { - max_freq = cur_freq; - mode = col_vec[i].first; - indice = col_vec[i].second; - } - cur_freq = 0; - } - } - t_out[i] = mode; - t_indices[i] = indice; - } -} - -template -static void ModeAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class ModeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - bool keepdim = static_cast(context.Attr("keepdim")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - // if axis is not the last dim, transpose it to the last dim, do the - // calculation, - // then tranpose it back to orginal axis. - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getMode(input_height, input_width, in_dims.size(), input, - output_data, indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - // get the trans input_dims, out_dims - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_input, trans_axis); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); - framework::Tensor tmp_indices; - auto* t_ind = tmp_indices.mutable_data(trans_out_shape, - context.GetPlace()); - - getMode(input_height, input_width, in_dims.size(), - &trans_input, t_out, t_ind); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans_axis); - TransCompute(ndims, dev_context, tmp_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - // allocate the memory for the input_grad - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - if (keepdim) { - ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, - x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - framework::DDim trans_shape(out_dims); - framework::DDim trans_in_shape(in_dims); - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = out_dims[trans_axis[i]]; - trans_in_shape[i] = in_dims[trans_axis[i]]; - } - // transpose the out_grad, indices - framework::Tensor trans_dO; - trans_dO.mutable_data(trans_shape, context.GetPlace()); - framework::Tensor trans_ind; - trans_ind.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - if (keepdim) { - // Do transpose - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans_axis); - } else { - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - // Do transpose - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans_axis); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); - const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; - - // Assign the out_grad to tranpose input_grad - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - ModeAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans_axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index b309e1b87ef9033bd4302cdad4ea60a64cbf02eb..5b107ce643df33af79230c30d784d1ad84c26666 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -16,77 +16,19 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -/** - * @brief compute the output shape and check the input shape valid or not - */ -inline framework::DDim ComputeAndCheckShape( - const bool is_runtime, const std::vector& inputs_dims) { - const size_t n = inputs_dims.size(); - auto first_dim = inputs_dims[0]; - - bool is_vector = false; - framework::DDim out_dim; - - PADDLE_ENFORCE_LT( - first_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the first tensor is 1D of size n view it as a row vector (1, n) - if (first_dim.size() == 1) { - first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); - is_vector = true; - } - - auto last_dim = inputs_dims[n - 1]; - PADDLE_ENFORCE_LT( - last_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the last tensor is 1D of size n view it as a column vector (n, 1) - if (last_dim.size() == 1) { - last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); - out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); - } else { - out_dim = is_vector ? phi::make_ddim({last_dim[1]}) - : phi::make_ddim({first_dim[0], last_dim[1]}); - } - - auto width = first_dim[1]; - for (size_t i = 1; i < n - 1; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast(2), - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); - - const auto& tmp_dim = inputs_dims[i]; - PADDLE_ENFORCE_EQ( - tmp_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - width = tmp_dim[1]; - } - - PADDLE_ENFORCE_EQ( - last_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - - return out_dim; -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -105,22 +47,6 @@ If the first argument is 1-D it is treated as a row vector. If the last argument class MultiDotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(1), - platform::errors::InvalidArgument( - "The number of input tensors in multi_dot op should > 1.")); - auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; class MultiDotOpGrad : public framework::OperatorWithKernel { @@ -171,9 +97,15 @@ class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(multi_dot, MultiDotInferShapeFunctor, + PD_INFER_META(phi::MultiDotInferMeta)); + REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, ops::MultiDotOpGradMaker, - ops::MultiDotOpGradMaker); + ops::MultiDotOpGradMaker, + MultiDotInferShapeFunctor); + REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 313a479ea301bb2c7dac0d0a27ca6064de99536a..8771a6573cba044d182aced752d3a65c446ad32e 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/multiplex_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ops::MultiplexGradMaker, ops::MultiplexGradMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); -REGISTER_OP_CPU_KERNEL( - multiplex, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel); -REGISTER_OP_CPU_KERNEL( - multiplex_grad, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu deleted file mode 100644 index 0a32ee96fb6938157364dc717724ce9193286f27..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/multiplex_op.cu +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/multiplex_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MultiplexGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto* ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T), stream); - } - } -}; - -template -class MultiplexGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T), stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - multiplex, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel); -REGISTER_OP_CUDA_KERNEL( - multiplex_grad, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h deleted file mode 100644 index 1d0a009edeedcad746853bb286af52cce474df87..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/multiplex_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -class MultiplexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - auto index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T)); - } - } -}; - -template -class MultiplexGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = - ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - auto* index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T)); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f7a3b82acf19fa79cbf5c632977e6ae533ae12b --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/number_count_op.h" + +namespace paddle { +namespace operators { + +class NumberCountOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("gate_idx"), "Input", "gate_idx", + "NumberCount"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "number_count", + "NumberCount"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // the dtype of the gate_idx should be same as int64 + auto gate_idx_dtype = + OperatorWithKernel::IndicateVarDataType(ctx, "gate_idx"); + + PADDLE_ENFORCE_EQ(gate_idx_dtype, framework::proto::VarType::INT64, + platform::errors::InvalidArgument( + "The dtype of the gate_idx_dtype should be int64")); + return framework::OpKernelType(gate_idx_dtype, ctx.GetPlace()); + } +}; + +class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("gate_idx", "(Tensor) The input gate index tensor."); + AddOutput("Out", "(Tensor) The output expert count tensor."); + AddAttr("upper_range", "(int), The number of experts."); + + AddComment(R"DOC(number_count Operator.count gate indices.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CPU_KERNEL(number_count, ops::NumberCountOpCPUKernel, + ops::NumberCountOpCPUKernel); + +REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp, + ops::NumberCountOpMaker); diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..97e4b4f2845ae132c28d3bb71dcc8e73f02e193a --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/number_count_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1) +#define PERTHREAD_EXPERTS 256 +#define WARP_SIZE 32 + +const int CUDA_NUM_THREADS = 512; +static inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +__global__ void initialize_zero_kernel(T* data, const int length) { + CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast(0); } +} + +template +__global__ void NumberCount(const T* gate_idx, T* number_count, + int64_t batch_size, int upper_range) { + int res_tmp[PERTHREAD_EXPERTS] = {0}; + int expert_min = blockIdx.x * PERTHREAD_EXPERTS; + int expert_max = expert_min + PERTHREAD_EXPERTS; + if (expert_max > upper_range) { + expert_max = upper_range; + } + for (int i = threadIdx.x; i < batch_size; i += blockDim.x) { + T idx = gate_idx[i]; + if (idx == -1) { + continue; + } + if (idx < expert_min || idx >= expert_max) { + continue; + } + res_tmp[idx - expert_min] += 1; + } + for (int i = expert_min; i < expert_max; ++i) { + int x = res_tmp[i - expert_min]; +#pragma unroll + for (int j = 1; j < WARP_SIZE; j <<= 1) { +#ifdef __HIPCC__ + x = x + __shfl_down(x, j); +#else + x = x + __shfl_down_sync(-1u, x, j); +#endif + } + if (threadIdx.x % WARP_SIZE == 0) { + platform::CudaAtomicAdd(number_count + i, x); + } + } +} + +template +class NumberCountOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto gate_idx = context.Input("gate_idx"); + auto upper_range = context.Attr("upper_range"); + auto number_count = context.Output("Out"); + + int64_t batch_size = gate_idx->numel(); + auto place = context.GetPlace(); + const auto& dev_ctx = + context.template device_context(); + + framework::DDim out_dims = phi::make_ddim({upper_range}); + auto out_data = number_count->mutable_data(out_dims, place); + const T* gate_data = gate_idx->data(); + + initialize_zero_kernel< + T><<>>( + out_data, upper_range); + + NumberCount< + T><<>>( + gate_data, out_data, batch_size, upper_range); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel); diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h new file mode 100644 index 0000000000000000000000000000000000000000..95e64946fb8a2156fdb4cbae880ccf2c143447ed --- /dev/null +++ b/paddle/fluid/operators/number_count_op.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class NumberCountOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support expert count op for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 9bd6ae8bab829e2f200f697e10e7e54f398f8d73..4d2a2e23b3f70dc48029be2e0a79c9695881b519 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -17,6 +17,26 @@ limitations under the License. */ namespace paddle { namespace operators { +framework::OpKernelType innerGetKernelTypeForVar( + const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { +#ifdef PADDLE_WITH_MKLDNN + auto isOneDNNKernelChosen = + (expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN); + auto isNotOneDNNTensor = (tensor.layout() != framework::DataLayout::kMKLDNN); + auto isModelNHWC = + (paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC); + // All inputs (including alpha) need shape rotating + if (isOneDNNKernelChosen && isNotOneDNNTensor && isModelNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); +} + class PReluOp : public framework::OperatorWithKernel { public: PReluOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -53,7 +73,7 @@ class PReluOp : public framework::OperatorWithKernel { "For mode 'channel', data_format must be one of " "NCHW and NHWC. But recevied data_format: %s", data_format_str)); - if (data_format_str == "NCHW") { + if (data_format_str == "NCHW" || ctx->IsRunMKLDNNKernel()) { PADDLE_ENFORCE_EQ( product(ctx->GetInputDim("Alpha")) == x_dim[1], true, platform::errors::InvalidArgument( @@ -128,6 +148,12 @@ class PReluOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; class PReluOpMaker : public framework::OpProtoAndCheckerMaker { @@ -212,6 +238,12 @@ class PReluGradOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; template diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 40e3cbde3b00917ee5952b8aebd412b357683018..82fc9ef1b7858992c49f537ce8608856ef6b6fde 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, REGISTER_OPERATOR(qr_grad, ops::QrGradOp); -REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); - REGISTER_OP_CPU_KERNEL( qr_grad, ops::QrGradKernel, ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index f09a07e96cd34e1b631ef9484fe23b12a3b58543..5ef02d8942797a720d18358d425cf45f77be82ad 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -48,85 +48,6 @@ static inline std::tuple _parse_qr_mode(std::string mode) { return std::make_tuple(compute_q, reduced); } -template -class QrCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool compute_q; - bool reduced_mode; - const Tensor& x = *context.Input("X"); - Tensor& q = *context.Output("Q"); - Tensor& r = *context.Output("R"); - std::string mode = context.Attr("mode"); - std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); - - auto numel = x.numel(); - PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( - "The input of QR is empty.")); - auto x_dims = x.dims(); - int x_rank = x_dims.size(); - int m = x_dims[x_rank - 2]; - int n = x_dims[x_rank - 1]; - int min_mn = std::min(m, n); - int k = reduced_mode ? min_mn : m; - int batch_size = numel / (m * n); - int x_stride = m * n; - int q_stride = m * k; - int r_stride = k * n; - - auto* x_data = x.data>(); - T* q_data = nullptr; - if (compute_q) { - q_data = q.mutable_data>( - context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - } - auto* r_data = r.mutable_data>( - context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - - // Implement QR by calling Eigen - for (int i = 0; i < batch_size; ++i) { - const T* x_matrix_ptr = x_data + i * x_stride; - T* r_matrix_ptr = r_data + i * r_stride; - using EigenDynamicMatrix = - Eigen::Matrix; - auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); - Eigen::HouseholderQR qr(x_matrix); - if (reduced_mode) { - auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); - auto r_matrix_view = - qr_top_matrix.template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } else { - auto r_matrix_view = - qr.matrixQR().template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } - - if (compute_q) { - T* q_matrix_ptr = q_data + i * q_stride; - if (reduced_mode) { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } else { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, m); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } - } - } - } -}; - template class QrGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index ac0cd75237baf5e8b860f197d42cd27bae65270e..bf78b6a696559cab152a6de2c4730a32dfdbb780 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::NotFound("Input(ROIs) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of ROIAlignOp " - "is not found.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ( - rois_num_dims.size(), 1, - platform::errors::InvalidArgument("The size of RoisNum should be 1" - ", but received size = %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ( - input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of Input(X) in" - "RoIAlignOp is NCHW. And the rank of input must be 4. " - "But received rank = %d", - input_dims.size())); - PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument( - "The rank of Input(ROIs) " - "in RoIAlignOp should be 2. " - "But the rank of RoIs is %d", - rois_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(rois_dims[1], 4, - platform::errors::InvalidArgument( - "The second dimension " - "of Input(ROIs) should be 4. But received the " - "dimension = %d", - rois_dims[1])); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The 'pooled_height' attribute in RoIAlignOp is " - "invalid. The height must be greater than 0. But " - "received 'pooled_height' = %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The 'pooled_width' attribute in RoIAlignOp is " - "invalid. The width must be greater than 0. But " - "received 'pooled_width' = %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The 'spatial_scale' attribute in RoIAlignOp is " - "invalid. The scale must be greater than 0. But " - "received 'spatial_scale' = %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor, + PD_INFER_META(phi::RoiAlignInferMeta)); + REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker, - ops::ROIAlignGradMaker); + ops::ROIAlignGradMaker, + RoiAlignInferShapeFunctor); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align_grad, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel); REGISTER_OP_VERSION(roi_align) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu deleted file mode 100644 index 1a2e64cd45ca401f5fb8ca6b6975a029ba735280..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_align_op.cu +++ /dev/null @@ -1,227 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; -static constexpr int kROISize = 4; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__device__ void BilinearInterpolateGradient(const int height, const int width, - T y, T x, T* w1, T* w2, T* w3, - T* w4, int* x_low, int* x_high, - int* y_low, int* y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return; - } - - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - *y_low = static_cast(y); - *x_low = static_cast(x); - if (*y_low >= height - 1) { - *y_high = *y_low = height - 1; - y = static_cast(*y_low); - } else { - *y_high = *y_low + 1; - } - if (*x_low >= width - 1) { - *x_high = *x_low = width - 1; - x = static_cast(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low, lx = x - *x_low; - T hy = 1. - ly, hx = 1. - lx; - *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; - - return; -} - -template -__global__ void GPUROIAlignBackward( - const int nthreads, const T* input_rois, const T* out_grad, - const int num_rois, const float spatial_scale, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, - T* input_grad, const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? T(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T* offset_input_grad = - input_grad + (roi_batch_ind * channels + c) * height * width; - - const T* offset_out_grad = - out_grad + (n * channels + c) * pooled_height * pooled_width; - const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - const T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - int x_low = -1, x_high = -1, y_low = -1, y_high = -1; - BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4, - &x_low, &x_high, &y_low, &y_high); - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, - diff1); - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, - diff2); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, - diff3); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, - diff4); - } - } - } - } -} - -template -class GPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - auto roi_ptr = - memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int)); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - int bytes = roi_batch_id_list.numel() * sizeof(int); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUROIAlignBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), rois_num, - spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, roi_id_data, in_grad->mutable_data(ctx.GetPlace()), - aligned); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align_grad, - ops::GPUROIAlignGradOpKernel, - ops::GPUROIAlignGradOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h deleted file mode 100644 index 589e35e4ab7ae4caf5efd3fb4d93a26b2ca86b26..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_align_op.h +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -void bilinear_interpolate_gradient(const int height, const int width, T y, T x, - const T out_grad_this_bin, const T count, - T* batch_grad_data) { - int x_low, y_low, x_high, y_high; - T w1, w2, w3, w4; - if (y < -1.0 || y > height || x < -1.0 || x > width) { - w1 = w2 = w3 = w4 = 0; - x_low = x_high = y_low = y_high = -1; - return; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - y_low = static_cast(y); - x_low = static_cast(x); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - *(batch_grad_data + y_low * width + x_low) += diff1; - *(batch_grad_data + y_low * width + x_high) += diff2; - *(batch_grad_data + y_high * width + x_low) += diff3; - *(batch_grad_data + y_high * width + x_high) += diff4; - } -} - -template -class CPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - in_grad->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - - if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) { - return; - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - - auto in_stride = phi::stride(in->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - T roi_offset = aligned ? T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - for (int c = 0; c < channels; ++c) { - T* batch_grad_data = - in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1]; - const T* batch_out_grad_data = - out_grad_data + n * out_stride[0] + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - T out_grad_this_bin = batch_out_grad_data[pool_index]; - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - bilinear_interpolate_gradient(height, width, y, x, - out_grad_this_bin, count, - batch_grad_data); - } - } - } - } - } - rois_data += roi_stride[0]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index f82510556fde87fbf4aeb1904e29325358598791..898db4c22fed9cc97baa261b5b512a889290aff3 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/roll_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of RollOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of RollOp should not be null.")); - - auto dims = ctx->Attrs().Get>("axis"); - auto shifts = ctx->Attrs().Get>("shifts"); - - if (!ctx->HasInput("ShiftsTensor")) { - if (dims.size() != 0) { - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "When dims.size() != 0, dims.size() " - "should be equal to " - "shifts.size(). But received " - "dims.size() = %d, shifts.size() = %d", - dims.size(), shifts.size())); - } else { - PADDLE_ENFORCE_EQ(shifts.size(), 1, - platform::errors::InvalidArgument( - "When dims.size() == 0, shifts.size() " - "should be equal to 1, But received " - "shifts.size() = %d", - shifts.size())); - } - } - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor, + PD_INFER_META(phi::RollInferMeta)); + REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker, ops::RollGradMaker, - ops::RollGradMaker); + ops::RollGradMaker, + RollInferShapeFunctor); REGISTER_OPERATOR(roll_grad, ops::RollGradOp, ops::RollGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CPU_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); REGISTER_OP_VERSION(roll) .AddCheckpoint( diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu deleted file mode 100644 index b9064c5450f9fbed64bcb65a2f9d15be2b56fbcf..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roll_op.cu +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/roll_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/utils/array.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void RollCudaKernel(const T* input, T* output, int64_t N, - phi::Array shifts, - phi::Array strides, - phi::Array sizes) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t output_idx = idx; - int64_t new_dim_idx = 0; - -#pragma unroll - for (size_t i = 0; i < Rank; i++) { - new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; - if (new_dim_idx >= sizes[i]) { - output_idx += (shifts[i] - sizes[i]) * strides[i]; - } else { - output_idx += shifts[i] * strides[i]; - } - } - output[output_idx] = input[idx]; -} - -template -class RollKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = (shifts[0] % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - - if (size != 0) { - shifts[i] = (shifts[i] % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - -#define CALL_ROLL_CUDA_KERNEL(N) \ - case N: { \ - phi::Array _strides; \ - phi::Array _shifts; \ - phi::Array _sizes; \ - for (size_t idx = 0; idx < N; ++idx) { \ - _strides[idx] = strides[idx]; \ - _shifts[idx] = shifts[idx]; \ - _sizes[idx] = sizes[idx]; \ - } \ - RollCudaKernel< \ - T, \ - N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \ - _shifts, _strides, _sizes); \ - break; \ - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -template -class RollGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input(framework::GradVarName("Out")); - auto* out = context.Output(framework::GradVarName("X")); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = ((-shifts[0]) % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - if (size != 0) { - shifts[i] = ((-shifts[i]) % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CUDA_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h deleted file mode 100644 index 413c7bcfc15eb1cae86c3fedf47ea4f677d1248c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roll_op.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, - int64_t shift) { - if (dim < 0) { - dim += input_dim.size(); - } - if (input_dim[dim] == 0) { - return; - } - shift = shift % input_dim[dim]; - if (shift < 0) { - shift += input_dim[dim]; - } - - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_dim[i]; - } - auto slice_width = 1; - for (auto i = dim + 1; i < input_dim.size(); i++) { - slice_width *= input_dim[i]; - } - - VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim - << "; dim: " << dim << "; shift: " << shift - << "; outer_loops: " << outer_loops - << "; slice_width: " << slice_width; - if (shift == 0) { - return; - } - - std::vector head; - auto head_size = slice_width * (input_dim[dim] - shift); - head.resize(head_size); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < head_size; j++) { - head[j] = data[i * input_dim[dim] * slice_width + j]; - } - for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { - auto dst_pos = j - input_dim[dim] + shift; - for (auto k = 0; k < slice_width; k++) { - data[(i * input_dim[dim] + dst_pos) * slice_width + k] = - data[(i * input_dim[dim] + j) * slice_width + k]; - } - } - for (auto j = 0; j < head_size; j++) { - data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; - } - } -} - -template -class RollKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar("X"); - auto* output_var = context.OutputVar("Out"); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - PADDLE_ENFORCE_EQ( - dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(axis[%d]) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.", - i, input_dim.size(), input_dim.size() - 1, i, dims[i])); - shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -template -class RollGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar(framework::GradVarName("Out")); - auto* output_var = context.OutputVar(framework::GradVarName("X")); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index e2c8359beb1290f7b1b592c1ff24b15986f41f73..9001ce5d51dece5c6cee481f3f6f92e69c302c2b 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,17 +25,6 @@ class ShapeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of get_shape op should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of get_shape op should not be null.")); - auto in_dim = ctx->GetInputDim("Input"); - ctx->SetOutputDim("Out", {in_dim.size()}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -89,7 +81,12 @@ Return the shape of the input. namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(shape, ShapeInferShapeFunctor, + PD_INFER_META(phi::ShapeInferMeta)); + REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ShapeInferShapeFunctor); diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 3e943c62e1ce17857e78e140efeb50e627e80a4e..c8010e8a128e0b2483c93ed38047b17060bfb0e9 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tril_triu_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ops::TrilTriuGradOpMaker, ops::TrilTriuGradOpMaker); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); -REGISTER_OP_CPU_KERNEL( - tril_triu, ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel); -REGISTER_OP_CPU_KERNEL( - tril_triu_grad, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel); diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu deleted file mode 100644 index 9cbbdeeb2ce28453f2c22d063975fa82aae5d3b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tril_triu_op.cu +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/tril_triu_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - tril_triu, ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel); -REGISTER_OP_CUDA_KERNEL( - tril_triu_grad, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel); diff --git a/paddle/fluid/operators/tril_triu_op.h b/paddle/fluid/operators/tril_triu_op.h deleted file mode 100644 index 3150b7617d10a8f9c2f60dd2e74ab2cbbb2d655e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tril_triu_op.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -class TrilTriuCompute { - public: - HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower, - const int64_t H, const int64_t W, T* out) - : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {} - - HOSTDEVICE void operator()(int64_t idx) { - const int64_t row = (idx / W_) % H_; - const int64_t col = idx % W_; - const bool mask = - lower_ ? (col - row > diagonal_) : (col - row < diagonal_); - out_[idx] = mask ? static_cast(0) : in_[idx]; - } - - private: - const T* in_; - const int diagonal_; - const bool lower_; - const int64_t H_; - const int64_t W_; - T* out_; -}; - -template -class TrilTriuOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - const auto* x_data = x->data(); - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); - - const int diagonal = context.Attr("diagonal"); - const bool lower = context.Attr("lower"); - - const auto& dims = x->dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(x->numel())); - - paddle::operators::TrilTriuCompute tril_triu_computer( - x_data, diagonal, lower, H, W, out_data); - for_range(tril_triu_computer); - } -}; - -template -class TrilTriuGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* d_out = - context.Input(framework::GradVarName("Out")); - const auto* dout_data = d_out->data(); - auto* d_x = context.Output(framework::GradVarName("X")); - auto* dx_data = d_x->mutable_data(context.GetPlace()); - - const int diagonal = context.Attr("diagonal"); - const bool lower = context.Attr("lower"); - - const auto& dims = d_out->dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(d_out->numel())); - - paddle::operators::TrilTriuCompute tril_triu_grad_computer( - dout_data, diagonal, lower, H, W, dx_data); - for_range(tril_triu_grad_computer); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index ad1c1814c05cdf7f96a6f3c05a5cf1a00d2a2e93..4145730357d6007368d26c46d2b1bd47c9085982 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc index e36cbcf228cfbf30c8fcd5562ac40f38a5467cdb..a44ea8ff689b85d9f718572c45b4f8fafaf1565d 100644 --- a/paddle/fluid/operators/tril_triu_op_xpu.cc +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -11,7 +11,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index ce2dba4db02a00398965f0ed656fa64e4fc828df..4001fd744e67784d736cb157743e4b4d7fa4517e 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -559,6 +559,34 @@ inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT } } +inline void RegisterModelLayout( + std::vector>& ops, + const platform::Place& place) { + if (platform::is_cpu_place(place)) { + auto check_attrib = [](std::unique_ptr& op, + const std::string& attrib_name) -> bool { + if (op->HasAttr(attrib_name)) { + auto data_format = op->Attr(attrib_name); + platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( + data_format.compare("NHWC") == 0 ? framework::DataLayout::kNHWC + : framework::DataLayout::kNCHW); + return true; + } else { + return false; + } + }; + + for (auto& op : ops) { + if (check_attrib(op, std::string("data_format"))) { + return; + } + if (check_attrib(op, std::string("data_layout"))) { + return; + } + } + } +} + inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) { return (op->GetAttrIfExists("mkldnn_data_type") == "int8" || op->GetAttrIfExists("use_quantizer")); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 9b373a58181f165b52fe809b817977a076b7d961..85427a8455b54ddb8dcfd453ba3d16684729a9d5 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -52,11 +52,13 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/slice_utils.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/core/compat/arg_map_context.h" +#include "paddle/phi/core/compat/type_defs.h" namespace paddle { namespace pybind { @@ -436,6 +438,28 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( return result; } +paddle::imperative::NameTensorMap ConvertToNameTensorMap( + const PyNameVarBaseMap &map) { + paddle::imperative::NameTensorMap result; + for (auto &pair : map) { + auto var_vec = CastPyArg2VectorOfTensor(pair.second.ptr(), 0); + if (!var_vec.empty()) { + // change vector -> vector> + std::vector> dst_var_vec; + for (auto &v : var_vec) { + dst_var_vec.emplace_back( + std::make_shared(std::move(v))); + } + result.emplace(pair.first, std::move(dst_var_vec)); + } + } + + PADDLE_ENFORCE_EQ( + PyErr_Occurred(), nullptr, + platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred())))); + return result; +} + template static void VarBaseCopy(std::shared_ptr &src, // NOLINT imperative::VarBase &dst, // NOLINT @@ -2079,8 +2103,8 @@ void BindImperative(py::module *m_ptr) { const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs) { // TODO(xiongkun): move this function outside of tracer. - auto ins_map = ConvertToNameVarBaseMap(ins); - auto outs_map = ConvertToNameVarBaseMap(outs); + auto ins_map = ConvertToNameTensorMap(ins); + auto outs_map = ConvertToNameTensorMap(outs); { auto to_vector = [](paddle::SmallVector &vec) { return std::vector(vec.begin(), vec.end()); diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index bb45c1c40603f953c70f0e63b6e762037312e8c3..ecbacd37d5666b85d5ddaef595d106e2400b055c 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -143,6 +143,7 @@ void BindNode(py::module *m) { .def("var", &Node::Var, return_value_policy::reference) .def("op", &Node::Op, return_value_policy::reference) .def("id", &Node::id) + .def("graph_id", &Node::GraphId) .def("original_desc_id", &Node::OriginalDescId) .def("is_op", &Node::IsOp) .def("is_var", &Node::IsVar) diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 9e86e3df8a6884ec1b75b8525ad858ff8f2e233c..d8750c1d6c115a6de8a493cac4ccadbd47bc10fd 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -88,6 +88,7 @@ std::map> op_ins_map = { {"nce", {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", "CustomDistAlias", "CustomDistAliasProbs"}}, + {"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 21bbc7f3e369bf66935487d3f3619c9a0890399b..ed42d0792eafbc8661883a7e8d5b396fac14686f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -114,6 +114,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" @@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) { // stored in this static instance to avoid illegal memory access. m.def("clear_kernel_factory", []() { phi::KernelFactory::Instance().kernels().clear(); }); + m.def("clear_device_manager", []() { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::Clear(); +#endif + }); // NOTE(zjl): ctest would load environment variables at the beginning even // though we have not `import paddle.fluid as fluid`. So we add this API diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 42de08ebc41938c40675435d4af10f758c52052b..3a1b45d3a20a1e3ff6698f37e412837fcb064f7c 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -90,6 +90,9 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor_map") { + return DenseTensorMapType::get(parser.getContext()); + } if (keyword == "dense_tensor") { // parse DenseTensor, for example: !i=Infrt.tensor llvm::StringRef target; @@ -134,6 +137,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return DenseTensorType::get( parser.getContext(), *targetType, *precisionType, *layoutType); } + + if (keyword == "dense_tensor_map") { + return DenseTensorMapType::get(parser.getContext()); + } // Todo: parse other type return mlir::Type(); } @@ -154,9 +161,13 @@ void InfrtDialect::printType(::mlir::Type type, << lod_tensor_type.getLod_level() << ">"; return; } + if (type.isa()) { + os << "dense_tensor_map"; + return; + } // print DenseTensorType, for example: !infrt.dense_tensor - if (type.isa()) { + if (type.isa()) { auto dense_tensor_type = type.cast(); os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " << dense_tensor_type.getPrecision() << ", " @@ -164,6 +175,12 @@ void InfrtDialect::printType(::mlir::Type type, return; } + // print DenseTensorType, for example: !infrt.dense_tensor + if (type.isa()) { + os << "dense_tensor_map"; + return; + } + llvm_unreachable("unknown infrt type."); } diff --git a/paddle/infrt/dialect/infrt/ir/infrt_ops.td b/paddle/infrt/dialect/infrt/ir/infrt_ops.td index f5430b03d0d75cfa8ba91f03ebc90ee0f73c25d7..82eba2a1746cce31e3fe99ae71c782bb88524930 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_ops.td @@ -53,9 +53,9 @@ def Infrt_CallOp : Infrt_Op<"call"> { }]; } -def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> { - let summary = "convert tensor type op"; - let description = [{convert tensor type op!}]; +def Infrt_TensorCastOp : Infrt_Op<"tensor_cast", [NoSideEffect]> { + let summary = "cast tensor type op"; + let description = [{cast tensor type op!}]; let arguments = (ins AnyType:$input); let results = (outs AnyType:$output); } diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index 51addb4deb43824965806962613d1ab4bd1c1e3d..7ae0bbae6275fdac1ea9e98084c866aa438ecce4 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -5,17 +5,17 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" include "paddle/infrt/dialect/pd_ops.td" -def FuseCvtTensorPattern : Pat< - (Infrt_CvtTensorOp (Infrt_CvtTensorOp $arg)), - (Infrt_CvtTensorOp $arg)>; +def FuseTensorCastPattern : Pat< + (Infrt_TensorCastOp (Infrt_TensorCastOp $arg)), + (Infrt_TensorCastOp $arg)>; -def FuseFeedCvtTensorPattern : Pat< - (Infrt_CvtTensorOp (PD_FeedOp $name)), +def FuseFeedTensorCastPattern : Pat< + (Infrt_TensorCastOp (PD_FeedOp $name)), (PD_FeedOp $name)>; def TypesAreIdentical : Constraint>; -def RedundantCvtTensorOptPattern : Pat< - (Infrt_CvtTensorOp:$res $arg), (replaceWithValue $arg), +def RedundantTensorCastOptPattern : Pat< + (Infrt_TensorCastOp:$res $arg), (replaceWithValue $arg), [(TypesAreIdentical $res, $arg)]>; diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index 25ecf2ae99dc3613944fcedaee427b540f0faae4..9d8ce5d8dfe399d06bfbe2f0c4b6457a8b3d61f1 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -27,8 +27,12 @@ struct InfrtOpFusePass : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "infrtOpFusePass"; } + + llvm::StringRef getArgument() const override { return "infrt-op-fuse"; } + void runOnFunction() override; }; + // Implementation of the InfrtOpFusePass. void InfrtOpFusePass::runOnFunction() { ::mlir::RewritePatternSet patterns(&getContext()); @@ -39,14 +43,18 @@ void InfrtOpFusePass::runOnFunction() { if (nullptr == terminator_op) return; for (auto operand : terminator_op->getOperands()) { auto *op1 = operand.getDefiningOp(); - auto cvt_op = ::llvm::dyn_cast<::infrt::CvtTensorOp>(op1); + auto cvt_op = ::llvm::dyn_cast<::infrt::TensorCastOp>(op1); if (!cvt_op) continue; mlir::Value value = cvt_op.input(); operand.replaceAllUsesWith(value); cvt_op.erase(); } } + } // namespace + std::unique_ptr infrt::createInfrtOpFusePass() { return std::make_unique(); } + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 4e73a533d99a79168b3e68b88d917f48ec811444..67f6bb8a2d7bbfa604614e4909169c08ea18e1b3 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -5,9 +5,6 @@ endif() add_subdirectory(ir) add_subdirectory(pass) -add_executable(phi-ir-exec phi_ir_exec.cc) -target_link_libraries(phi-ir-exec infrt) - add_executable(phi-exec phi_exec.cc) target_link_libraries(phi-exec infrt) diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index d8095d7f3f13fcfbf9b2ccab6db182850633d632..f91381fe729034b3e2d36068dce43d531bfedc1c 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -29,6 +29,7 @@ namespace infrt { namespace phi { void PHIDialect::initialize() { + LOG(INFO) << "PHI Dialect initalized"; addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt index 5c55a6b0acaed7be9ee86b4662d895d08ca05bdc..dc60ecf63fe2eaeffc11ac932e594274c01f8580 100644 --- a/paddle/infrt/dialect/phi/pass/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt @@ -2,6 +2,8 @@ core_gather_headers() gather_srcs(infrt_src SRCS proto_arg_map_context.cc - phi_op_cvt_pass.cc + phi_op_convert_pass.cc kernel_op_desc.cc - ) + ) + +cc_test(test_kernel_op_desc SRCS kernel_op_desc_test.cc DEPS infrt) diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 353b1054e71374987207e1055289258915e0774d..a26e8e2dca57081d9935883bbe0f01188abf1f1b 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -73,7 +73,7 @@ std::string getPhiLayoutSuffix(LayoutType layout) { } } -std::vector getCandidateKernels( +std::vector GetCandidateKernels( std::string name, const std::vector& valid_palces) { std::vector candidate_kernels; PhiKernelDesc phi_kernel_desc; @@ -88,19 +88,20 @@ std::vector getCandidateKernels( if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; place.layout = LayoutType::ANY; } - phi_kernel_desc.kernelType = place; - phi_kernel_desc.inputsType.clear(); - phi_kernel_desc.outputsType.clear(); + phi_kernel_desc.kernel_type = place; + phi_kernel_desc.input_types.clear(); + phi_kernel_desc.output_types.clear(); phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); const paddle::SmallVector& input_arg = args_def.input_defs(); const paddle::SmallVector& output_arg = args_def.output_defs(); for (auto tensor_arg : input_arg) { - phi_kernel_desc.inputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); + phi_kernel_desc.input_types.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } for (auto tensor_arg : output_arg) { - phi_kernel_desc.outputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); + phi_kernel_desc.output_types.emplace_back( + ConvertPlaceFromPhi(tensor_arg)); } candidate_kernels.emplace_back(phi_kernel_desc); } diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index b1f7c6c0811def9141e8012518fff5f504934149..cdc8f7cbff553687bed63d165b18c4bc8efdb807 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -21,16 +21,16 @@ namespace infrt { struct PhiKernelDesc { - std::vector inputsType; // kernel input place - std::vector outputsType; // kernel output place - Place kernelType; // kernel place + std::vector input_types; // kernel input place + std::vector output_types; // kernel output place + Place kernel_type; // kernel place }; std::string getPhiTargetPrefix(TargetType target); std::string getPhiPrecisionSuffix(PrecisionType precision); std::string getPhiLayoutSuffix(LayoutType layout); -std::vector getCandidateKernels( +std::vector GetCandidateKernels( std::string name, const std::vector& valid_palces); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd5f0799a60d5d3925e1e1265997820c37b438e6 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/phi/kernels/declarations.h" + +namespace infrt { + +TEST(phi, get_op_desc) { + std::vector places; + places.emplace_back( + TargetType::CPU, PrecisionType::FLOAT32, LayoutType::NCHW); + auto kernels = GetCandidateKernels("addmm", places); + ASSERT_GE(kernels.size(), 1UL); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc similarity index 62% rename from paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc rename to paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index 485bf2a75d890aa0df5e888e7284ae3451aa514c..f9e124aba6c28695f2c0fafa91404d2d10db8eea 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include #include @@ -24,35 +24,52 @@ #include #include +#include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/ops/compat/signatures.h" namespace { -class phiOpCvtPass - : public mlir::PassWrapper { +class PhiOpConvertPass + : public mlir::PassWrapper { public: - ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } + ::llvm::StringRef getName() const override { return "PhiOpConvertPass"; } void runOnFunction() override; - explicit phiOpCvtPass( - std::vector valid_places = std::vector()) + PhiOpConvertPass(); + explicit PhiOpConvertPass(const std::vector &valid_places) : valid_places_(valid_places) {} + PhiOpConvertPass(const PhiOpConvertPass &other) + : mlir::PassWrapper(*this), + valid_places_(other.valid_places_) {} + + ::llvm::StringRef getArgument() const override { return "phi-op-convert"; } + void getDependentDialects(mlir::DialectRegistry ®istry) const override; + private: void convertStage(); - void diapatchStage(); + void dispatchStage(); + + // Force a specified data format for all layout sensitive operations. + Option valid_places_options_{ + *this, + "valid-targets", + llvm::cl::desc("Set the valid target, [CPU-FP32-NCHW]")}; + std::vector valid_places_; }; - -// Implementation of the phiOpCvtPass. -void phiOpCvtPass::runOnFunction() { +// Implementation of the PhiOpConvertPass. +void PhiOpConvertPass::runOnFunction() { convertStage(); - diapatchStage(); + dispatchStage(); } -void phiOpCvtPass::convertStage() { + +void PhiOpConvertPass::convertStage() { mlir::Block &body = getFunction().front(); std::vector worklist; for (auto &op : body.without_terminator()) { @@ -62,9 +79,9 @@ void phiOpCvtPass::convertStage() { while (!worklist.empty()) { auto *op = worklist.back(); worklist.pop_back(); - if (op == nullptr) continue; + if (!op) continue; - std::string op_name = op->getName().getIdentifier().str(); + auto op_name = op->getName().getIdentifier().str(); // only convert op in pd dialect. if (op_name.substr(0, 3) != "pd.") continue; @@ -73,6 +90,7 @@ void phiOpCvtPass::convertStage() { pd_dialect_inputs_info_map_.end() || pd_dialect_outputs_info_map_.find(op_name) == pd_dialect_outputs_info_map_.end()) { + LOG(WARNING) << "No op info found for " << op_name; // Todo: print log continue; } @@ -85,7 +103,8 @@ void phiOpCvtPass::convertStage() { ::llvm::SmallVector output_types; for (const std::string &str : std::get<0>(kernel_sign.args)) { if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { - // Todo: print error log + LOG(ERROR) << "No input info for Op " << op_name << " and argument " + << str; return; } uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); @@ -94,7 +113,8 @@ void phiOpCvtPass::convertStage() { for (const std::string &str : std::get<2>(kernel_sign.args)) { if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { - // Todo: print error log + LOG(ERROR) << "No output info for Op " << op_name << " and argument " + << str; return; } uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); @@ -109,14 +129,13 @@ void phiOpCvtPass::convertStage() { for (size_t index = 0; index < ori_output.size(); ++index) { ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); } - if (!op->use_empty()) { - // Todo: print error log - return; - } + + CHECK(op->use_empty()); op->erase(); } } -void phiOpCvtPass::diapatchStage() { + +void PhiOpConvertPass::dispatchStage() { std::vector worklist; mlir::Block &block = getFunction().front(); for (auto &op : block) { @@ -129,7 +148,7 @@ void phiOpCvtPass::diapatchStage() { for (infrt::KernelOp kernel_op : worklist) { std::string kernel_name = kernel_op.name().str(); std::vector candidates = - getCandidateKernels(kernel_name, valid_places_); + GetCandidateKernels(kernel_name, valid_places_); if (candidates.empty()) { LOG(FATAL) << "No candidate kernels for op:" << kernel_name; continue; @@ -140,17 +159,17 @@ void phiOpCvtPass::diapatchStage() { const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front(); kernel_name = - infrt::getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + + infrt::getPhiTargetPrefix(phi_kernel_desc.kernel_type.target) + kernel_name + - infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + - infrt::getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); + infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernel_type.precision) + + infrt::getPhiLayoutSuffix(phi_kernel_desc.kernel_type.layout); mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); - if (phi_context.find(phi_kernel_desc.kernelType.target) == + if (phi_context.find(phi_kernel_desc.kernel_type.target) == phi_context.end()) { - switch (phi_kernel_desc.kernelType.target) { + switch (phi_kernel_desc.kernel_type.target) { case infrt::TargetType::CPU: { auto context_value = builder @@ -169,33 +188,36 @@ void phiOpCvtPass::diapatchStage() { } } operation_state.addOperands( - phi_context.at(phi_kernel_desc.kernelType.target)); - for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) { + phi_context.at(phi_kernel_desc.kernel_type.target)); + + for (size_t index = 0; index < phi_kernel_desc.input_types.size(); + ++index) { mlir::Value input = kernel_op.getOperand(index); - auto cvt_tensor_type_op = builder.create( + auto cvt_tensor_type_op = builder.create( kernel_op.getLoc(), infrt::DenseTensorType::get( kernel_op.getContext(), - phi_kernel_desc.inputsType[index].target, - phi_kernel_desc.inputsType[index].precision, - phi_kernel_desc.inputsType[index].layout), + phi_kernel_desc.input_types[index].target, + phi_kernel_desc.input_types[index].precision, + phi_kernel_desc.input_types[index].layout), input); operation_state.addOperands(cvt_tensor_type_op.output()); } - for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); + + for (size_t index = 0; index < phi_kernel_desc.output_types.size(); ++index) { operation_state.addTypes(infrt::DenseTensorType::get( kernel_op.getContext(), - phi_kernel_desc.outputsType[index].target, - phi_kernel_desc.outputsType[index].precision, - phi_kernel_desc.outputsType[index].layout)); + phi_kernel_desc.output_types[index].target, + phi_kernel_desc.output_types[index].precision, + phi_kernel_desc.output_types[index].layout)); } operation_state.addAttributes(kernel_op.attrsAttr().getValue()); mlir::Operation *phi_operation = builder.createOperation(operation_state); - for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); + for (size_t index = 0; index < phi_kernel_desc.output_types.size(); ++index) { mlir::Value input = phi_operation->getResult(index); - auto cvt_tensor_type_op = builder.create( + auto cvt_tensor_type_op = builder.create( kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); kernel_op.getResult(index).replaceAllUsesWith( cvt_tensor_type_op.output()); @@ -204,9 +226,35 @@ void phiOpCvtPass::diapatchStage() { } } +PhiOpConvertPass::PhiOpConvertPass() { + if (!valid_places_options_.hasValue()) { + valid_places_.emplace_back(infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); + return; + } + + LOG(FATAL) << "To be done for specifying places in command line"; +} + +void PhiOpConvertPass::getDependentDialects( + mlir::DialectRegistry ®istry) const { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); +} + } // namespace +mlir::PassRegistration phi_op_convert; + std::unique_ptr infrt::createPhiOpCvtPass( std::vector valid_places) { - return std::make_unique(valid_places); + return std::make_unique(valid_places); +} + +std::unique_ptr infrt::createPhiOpCvtPass() { + return std::make_unique(); } diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h similarity index 86% rename from paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h rename to paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h index 8b1944042aa7c42fef87786af0d0fa131c6f0535..5a2c0ee96ed0de120f8667d8f2fb91314c02e9ac 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h @@ -21,7 +21,8 @@ namespace infrt { * phiOpCvtPass. * Convert the general operators from pd Dialect to phi dialect. */ -std::unique_ptr createPhiOpCvtPass( - std::vector valid_places = std::vector()); +std::unique_ptr createPhiOpCvtPass(std::vector valid_places); + +std::unique_ptr createPhiOpCvtPass(); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc index de61dba8e744c88f279761520ac1815bb265d875..0beb5bff29f6df73be75a18611a5207bb1e3aad7 100644 --- a/paddle/infrt/dialect/phi/phi_ir_exec.cc +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -18,7 +18,7 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" int main(int argc, char** argv) { static llvm::cl::opt input_file( diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 1506282f6268191a2eece5540d30fbe90d8eeb52..319df90d3eec133d3f02be6749e9ad379fd225fd 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -30,7 +30,7 @@ #include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" #endif diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 18c25827b8ec5a71907e694cea4e7680b598e883..96aecb755898466ee9530cb3bdc894e51d074cd7 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -13,15 +13,17 @@ // limitations under the License. #include "paddle/infrt/host_context/paddle_mlir.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops_info.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { - context_->allowUnregisteredDialects(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); + context_->getOrLoadDialect<::infrt::InfrtDialect>(); module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_)); } @@ -55,7 +57,6 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( UpdateModelParams(program, &mainFunc); UpdateModelOps(program); UpdateModelOutputs(program); - return module_; } @@ -171,7 +172,11 @@ void MLIRModelGenImpl::UpdateModelParams( ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), builder_, &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( @@ -197,8 +202,9 @@ void MLIRModelGenImpl::UpdateModelOutputs( llvm::SmallVector resultTypes; llvm::SmallVector attrs; + mlir::OperationState state(loc, - mlir::ReturnOp::getOperationName(), + ::infrt::ReturnOp::getOperationName(), operands, resultTypes, attrs); @@ -321,7 +327,7 @@ llvm::SmallVector MLIRModelGenImpl::GetOpAttributes( switch (type) { ATTR_IMPL_CASE(FLOAT, f, getF32FloatAttr); ATTR_IMPL_CASE(BOOLEAN, b, getBoolAttr); - ATTR_IMPL_CASE(INT, i, getI32IntegerAttr); + ATTR_IMPL_CASE(INT, i, getSI32IntegerAttr); ATTR_IMPL_CASE(LONG, l, getI64IntegerAttr); ATTR_IMPL_CASE(STRING, s, getStringAttr); diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index 5ce6d8673421ba3c53c9dad6d2fd1f20298f837a..58543a6864258bd6c0153150bb535262d9a8f00d 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,6 +1,8 @@ +cc_test_tiny(test_abs_model SRCS model/test_abs.cc DEPS infrt ${MLIR_IR_LIBS}) + configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" - DEPENDS infrtopt infrtexec phi-ir-exec) + DEPENDS infrtopt infrtexec) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir index 61a66cb3d71a372bcd67cb96362abcb033768e4d..47badd97d37db578ec36f496b21212d73fd9920e 100644 --- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir @@ -1,4 +1,5 @@ -// RUN: phi-ir-exec %s +// RUN: infrtopt -phi-op-convert -infrt-op-fuse %s + // CHECK-LABEL: @ops func @ops() { %a = pd.feed() {name="input0"} : !infrt.lod_tensor @@ -8,3 +9,10 @@ func @ops() { %h = "pd.abs"(%g):(tensor) -> tensor "pd.fetch"(%h) {name="output"} :(tensor)->() } + +// CHECK-LABEL: @op_execute +func @op_execute(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor, %c:!infrt.lod_tensor) -> !infrt.lod_tensor { + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor + %h = "pd.abs"(%g):(tensor) -> tensor + "pd.fetch"(%h) {name="output"} :(tensor)->() +} diff --git a/paddle/infrt/tests/model/abs_model.py b/paddle/infrt/tests/model/abs_model.py new file mode 100644 index 0000000000000000000000000000000000000000..dd1632bc9d4d8e4e6ea0fb918d1179f4e28a441b --- /dev/null +++ b/paddle/infrt/tests/model/abs_model.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import Layer +from paddle.static import InputSpec +from paddle.jit import to_static +import sys + + +class AbsNet(paddle.nn.Layer): + def __init__(self): + super(AbsNet, self).__init__() + + def forward(self, x): + x = paddle.abs(x) + return x + + +if __name__ == '__main__': + # build network + model = AbsNet() + # save inferencing format model + net = to_static( + model, input_spec=[InputSpec( + shape=[None, 1, 28, 28], name='x')]) + paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc new file mode 100644 index 0000000000000000000000000000000000000000..5de159b86fce29f774b07770aaaee0c1b6aebd31 --- /dev/null +++ b/paddle/infrt/tests/model/test_abs.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/registry.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" +#include "paddle/infrt/host_context/paddle_mlir.h" + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +TEST(ABS_MODEL, convert_and_execute) { + std::string model_file_name = "./abs.pdmodel"; + std::string params_file_name = "./abs.pdiparams"; + // convert model + MLIRModelGenImpl myGen; + auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name); + module_.dump(); + // pick kernel + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->loadAllAvailableDialects(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); + phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); + + if (mlir::failed(pm.run(module_))) { + std::cout << "\npass failed!\n" << std::endl; + } + module_.dump(); + + // executate + infrt::host_context::KernelRegistry registry; + infrt::kernel::RegisterBasicKernels(®istry); + infrt::kernel::RegisterTestKernels(®istry); + infrt::kernel::RegisterTensorShapeKernels(®istry); + infrt::kernel::RegisterTensorKernels(®istry); + infrt::kernel::RegisterControlFlowKernels(®istry); + infrt::kernel::RegisterPhiKernels(®istry); + infrt::kernel::RegisterInferShapeLaunchers(®istry); + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + break; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast( + reg_sym); + reg_func(®istry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + infrt::host_context::TestMlir(module_, ®istry); +} diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 7b074d0ebb76d110dc361140bd42f78ef54f224b..04e1bbcc9df423bc38e78822ec6ef8ee28c5b216 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -25,8 +25,6 @@ add_subdirectory(tests) # make an unity target for compile deps set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) -# keep this message for debug, remove it later if needless -message(STATUS "All standard phi kernels: ${phi_kernels}") set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) cc_library(phi DEPS ${PHI_DEPS}) diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 1ffe38d8e1f4ce59aa819a5eaa46c75d5fded5b0..35339aed0f3e1cd87ac65855e0255fa3277a6bfb 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() { return platform_manager; } +void DeviceManager::Clear() { + Instance().device_map_.clear(); + Instance().device_impl_map_.clear(); +} + std::vector ListAllLibraries(const std::string& library_dir) { std::vector libraries; std::regex express(".*\\.so"); diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index c0911a0f8d50c52697b748f3726faded5a428694..39eef27b4a607bd3af75a6b5dde07f715e5537e5 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -158,6 +158,8 @@ class DeviceManager { static std::vector GetDeviceList(const std::string& device_type); + static void Clear(); + private: DISABLE_COPY_AND_ASSIGN(DeviceManager); DeviceManager() {} diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 06ee5a205d7b0f2f842e1b9b4b8fad8948168b64..260fbfe7197912fd3dd5b9103a0a991a45d55816 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -21,6 +21,10 @@ limitations under the License. */ namespace phi { +// Common InferMeta Functions for backward operators. +// +// NOTE: The InferMeta Functions in this file are arranged in alphabetic order. + void BilinearTensorProductGradInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index ff2cf81a904e0a7a47b7ca44fdc3918cbdac902c..38dce0dc69d317d95541f3f10ba8018b03b9d6b5 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -73,6 +73,51 @@ void AllValueCompareInferMeta(const MetaTensor& x, out->set_dtype(DataType::BOOL); } +void KLDivInferMeta(const MetaTensor& x, + const MetaTensor& label, + const std::string& reduction, + MetaTensor* out, + MetaConfig config) { + auto dim_x = x.dims(); + auto dim_target = label.dims(); + PADDLE_ENFORCE_EQ(dim_x.size(), + dim_target.size(), + phi::errors::InvalidArgument( + "Input(X) rank and Input(Target) rank should be " + "same, but received X rank(%d) != Target rank(%d)", + dim_x.size(), + dim_target.size())); + for (int i = 0; i < dim_x.size(); i++) { + if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) { + PADDLE_ENFORCE_EQ( + dim_x[i], + dim_target[i], + phi::errors::InvalidArgument( + "Input(X) and Input(Target) should in same shape. but received " + "X dimension[%d](%d) != Target dimension[%d](%d)", + i, + dim_x[i], + i, + dim_target[i])); + } + } + + auto reduction_valid = "mean" == reduction || "sum" == reduction || + "batchmean" == reduction || "none" == reduction; + PADDLE_ENFORCE_EQ( + reduction_valid, + true, + phi::errors::InvalidArgument( + "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); + + if ("none" == reduction) { + out->set_dims(dim_x); + } else { + out->set_dims({1}); + } + out->set_dtype(x.dtype()); +} + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { out->share_meta(x); } @@ -431,6 +476,55 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void GatherInferMeta(const MetaTensor& x, + const MetaTensor& index, + const Scalar& axis, + MetaTensor* out) { + auto index_dims = index.dims(); + + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + + auto input_dim = x.dims(); + auto axis_v = axis.to(); + if (axis.FromTensor() || axis_v == 0) { + // if axis.FromTensor(), we can not obtain correct shape of output + int batch_size = index_dims[0]; + phi::DDim output_dims(input_dim); + output_dims[0] = batch_size; + out->set_dims(output_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); + } else { + int index_size = index_dims[0]; + std::vector out_dim_vec; + for (int i = 0; i < axis_v; i++) { + out_dim_vec.push_back(input_dim[i]); + } + out_dim_vec.push_back(index_size); + for (int i = axis_v + 1; i < input_dim.size(); i++) { + out_dim_vec.push_back(input_dim[i]); + } + auto output_dims = phi::make_ddim(out_dim_vec); + out->set_dims(output_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); + } +} + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out) { @@ -549,6 +643,49 @@ void IndexSampleInferMeta(const MetaTensor& x, out->share_lod(y); } +void IndexSelectInferMeta(const MetaTensor& x, + const MetaTensor& index, + int dim, + MetaTensor* output) { + auto input_dim = x.dims(); + auto index_dim = index.dims(); + + PADDLE_ENFORCE_EQ( + dim < input_dim.size() && dim >= (0 - input_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + input_dim.size(), + input_dim.size() - 1, + dim)); + + PADDLE_ENFORCE_EQ( + index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), + true, + phi::errors::InvalidArgument( + "The 'shape' of Input(Index) must be 1-D tensor. " + "But received: the 'shape' of Input(Index) is [%s], " + "the dimension of Input(Index) is [%d].", + index_dim, + index_dim.size())); + + PADDLE_ENFORCE_EQ( + index_dim[0] != 0, + true, + phi::errors::InvalidArgument("The length of Input(Index) can't be 0.")); + + auto output_dim = phi::vectorize(input_dim); + if (dim < 0) { + dim += input_dim.size(); + } + output_dim[dim] = index_dim[0]; + output->set_dims(phi::make_ddim(output_dim)); + output->set_dtype(x.dtype()); + output->set_layout(x.layout()); + output->share_lod(x); +} + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, @@ -813,6 +950,16 @@ void TriangularSolveInferMeta(const MetaTensor& x, out->share_lod(y); } +void ValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + detail::BinarySameInputDimsCheck(x, y, config); + + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index cfae45cf04b87c287a174d172700a794c8c2a2a3..8cf7ce3930e941a3c5243306fa38e4466059509a 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -28,12 +29,20 @@ namespace phi { // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void AllValueCompareInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, MetaConfig config = MetaConfig()); +void KLDivInferMeta(const MetaTensor& x, + const MetaTensor& label, + const std::string& reduction, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void BCELossInferMeta(const MetaTensor& input, @@ -81,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void GatherInferMeta(const MetaTensor& x, + const MetaTensor& index, + const Scalar& axis, + MetaTensor* out); + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out); @@ -101,6 +115,11 @@ void IndexSampleInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void IndexSelectInferMeta(const MetaTensor& x, + const MetaTensor& index, + int dim, + MetaTensor* output); + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, @@ -136,4 +155,9 @@ void TriangularSolveInferMeta(const MetaTensor& x, bool unitriangular, MetaTensor* out); +void ValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 84441ed8b740be172ddaa7de3fc23ad420ebf077..ef75ab573c6d9bd5c65fad747a28f2c704257371 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -369,6 +369,79 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { + auto inputs_dims = GetMetaTensorsDim(x); + + const size_t inputs_num = inputs_dims.size(); + PADDLE_ENFORCE_GT( + inputs_num, + static_cast(1), + phi::errors::InvalidArgument( + "The number of input tensors in multi_dot op should > 1.")); + + const size_t n = inputs_dims.size(); + auto first_dim = inputs_dims[0]; + + bool is_vector = false; + phi::DDim out_dim; + + PADDLE_ENFORCE_LT( + first_dim.size(), + static_cast(3), + phi::errors::InvalidArgument( + "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + + // If the first tensor is 1D of size n view it as a row vector (1, n) + if (first_dim.size() == 1) { + first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); + is_vector = true; + } + + auto last_dim = inputs_dims[n - 1]; + PADDLE_ENFORCE_LT( + last_dim.size(), + static_cast(3), + phi::errors::InvalidArgument( + "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + + // If the last tensor is 1D of size n view it as a column vector (n, 1) + if (last_dim.size() == 1) { + last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); + out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); + } else { + out_dim = is_vector ? phi::make_ddim({last_dim[1]}) + : phi::make_ddim({first_dim[0], last_dim[1]}); + } + + auto width = first_dim[1]; + for (size_t i = 1; i < n - 1; i++) { + PADDLE_ENFORCE_EQ(inputs_dims[i].size(), + static_cast(2), + phi::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + + const auto& tmp_dim = inputs_dims[i]; + PADDLE_ENFORCE_EQ( + tmp_dim[0], + width, + phi::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); + width = tmp_dim[1]; + } + + PADDLE_ENFORCE_EQ( + last_dim[0], + width, + phi::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); + + out->set_dims(out_dim); + out->set_dtype(x.at(0)->dtype()); + out->share_lod(*x.at(0)); +} + void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index c11843212ed33fd8170e6677a4d6e0ad95b730dc..6de95386dd998810b508db6d0469691a37cd53dd 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,23 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +// Common InferMeta Functions for multiary operators, The format like: +// +// 1. The number of input MetaTensor is more than 3: +// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, +// const MetaTensor& y, +// const MetaTensor& z, +// const MetaTensor& w, +// ..., +// MetaTensor* out) {} +// +// 2. There are `const vector&` in params: +// void [FunctionDesc|OpName]InferMeta(const vector& x, +// ..., +// MetaTensor* out) {} +// +// NOTE: The InferMeta Functions in this file are arranged in alphabetic order. + std::vector GetMetaTensorsDim(const std::vector& tensors); void AdadeltaInferMeta(const MetaTensor& param, @@ -70,6 +87,8 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void MultiDotInferMeta(const std::vector& x, MetaTensor* out); + void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index 38eaa636f8c8779c5a1f597b8cfb23ce6efc5edc..55e59b27e71cfb1d9b16a659e40d299ed3f2fc54 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -27,6 +27,8 @@ namespace phi { // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 235cfe368c1921eac546b670470963fb49100290..837750710c9a3dcf3c8b414c5c52a7272a0b3f58 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input, total_weight->set_dtype(input.dtype()); } +void RoiAlignInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + auto boxes_dims = boxes.dims(); + + if (boxes_num) { + auto boxes_num_dims = boxes_num->dims(); + PADDLE_ENFORCE_EQ( + boxes_num_dims.size(), + 1, + phi::errors::InvalidArgument("The size of RoisNum should be 1" + ", but received size = %d", + boxes_num_dims.size())); + } + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The format of Input(X) in" + "RoIAlignOp is NCHW. And the rank of input must be 4. " + "But received rank = %d", + input_dims.size())); + PADDLE_ENFORCE_EQ(boxes_dims.size(), + 2, + phi::errors::InvalidArgument("The rank of Input(ROIs) " + "in RoIAlignOp should be 2. " + "But the rank of RoIs is %d", + boxes_dims.size())); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(boxes_dims[1], + 4, + phi::errors::InvalidArgument( + "The second dimension " + "of Input(ROIs) should be 4. But received the " + "dimension = %d", + boxes_dims[1])); + } + + PADDLE_ENFORCE_GT(pooled_height, + 0, + phi::errors::InvalidArgument( + "The 'pooled_height' attribute in RoIAlignOp is " + "invalid. The height must be greater than 0. But " + "received 'pooled_height' = %d", + pooled_height)); + PADDLE_ENFORCE_GT(pooled_width, + 0, + phi::errors::InvalidArgument( + "The 'pooled_width' attribute in RoIAlignOp is " + "invalid. The width must be greater than 0. But " + "received 'pooled_width' = %d", + pooled_width)); + PADDLE_ENFORCE_GT(spatial_scale, + 0.0f, + phi::errors::InvalidArgument( + "The 'spatial_scale' attribute in RoIAlignOp is " + "invalid. The scale must be greater than 0. But " + "received 'spatial_scale' = %f", + spatial_scale)); + + auto out_dims = input_dims; + out_dims[0] = boxes_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 209a07db18b5c7a87ba094c5839149533757220d..0e7b9cb12a4d0b44727f488412af754e2ba8ad94 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -30,6 +30,8 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. // +// The InferMeta Functions in this file are arranged in alphabetic order. + void AccuracyInferMeta(const MetaTensor& out, const MetaTensor& indice, const MetaTensor& label, @@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input, MetaTensor* total_weight, MetaConfig config = MetaConfig()); +void RoiAlignInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4d1cb42bd59f072e5926b237528a742c231bcdcf..f81f4a1b7c7390baca82d55abecbc49d0f67c235 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -554,6 +554,28 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { + auto dims = x.dims(); + auto n_dim = dims.size(); + PADDLE_ENFORCE_GE(n_dim, + 2, + phi::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions. But " + "received a %d dimension tensor.", + n_dim)); + PADDLE_ENFORCE_EQ(dims[n_dim - 2], + dims[n_dim - 1], + phi::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + dims[n_dim - 2], + dims[n_dim - 1])); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void MaxPoolWithIndexInferMeta(const MetaTensor& x, const std::vector& kernel_size, const std::vector& strides, @@ -626,6 +648,49 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, mask->set_dtype(paddle::experimental::CppTypeToDataType::Type()); } +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), + true, + errors::InvalidArgument( + "the axis of ModeOp must be [-%d, %d), but you set axis is %d", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + errors::InvalidArgument("input of ModeOp must have >= 1d shape")); + if (axis < 0) axis += dim_size; + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + PADDLE_ENFORCE_GE(input_dims.size(), + 1, + errors::InvalidArgument("input shape should >= 1d")); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, @@ -994,6 +1059,43 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ReshapeInferMeta(x, shape, out, config); } +void RollInferMeta(const MetaTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + MetaTensor* out) { + auto shifts_data = shifts.GetData(); + + if (axis.size() != 0) { + PADDLE_ENFORCE_EQ( + axis.size(), + shifts_data.size(), + phi::errors::InvalidArgument("When dims.size() != 0, dims.size() " + "should be equal to " + "shifts.size(). But received " + "dims.size() = %d, shifts.size() = %d", + axis.size(), + shifts_data.size())); + } else { + PADDLE_ENFORCE_EQ( + shifts_data.size(), + 1, + phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() " + "should be equal to 1, But received " + "shifts.size() = %d", + shifts_data.size())); + } + + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { + auto in_dim = input.dims(); + out->set_dims(phi::make_ddim({in_dim.size()})); + out->set_dtype(DataType::INT32); +} + void ShardIndexInferMeta(const MetaTensor& in, int index_num, int nshards, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 75fb9fadf82dc87ac18814e0674e5012fea95ec4..eb894003e5354222bff14945cc0e2aeb565b4e3d 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -31,6 +31,8 @@ class MetaConfig; // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void ArgMinMaxInferMeta(const MetaTensor& x, int64_t axis, @@ -98,6 +100,8 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); +void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); + void MaxPoolWithIndexInferMeta(const MetaTensor& x, const std::vector& kernel_size, const std::vector& strides, @@ -108,6 +112,12 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, MetaTensor* mask, MetaConfig config = MetaConfig()); +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, @@ -162,6 +172,13 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void RollInferMeta(const MetaTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + MetaTensor* out); + +void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); + void ShardIndexInferMeta(const MetaTensor& in, int index_num, int nshards, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 761d65ed36c29552569f3c759b57e7b202eaba60..5ddaea6e3c57f11219d6297ea98abf42f63cc7d5 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,7 +27,11 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel) +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel + matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel + put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel + softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel + triangular_solve_grad_kernel determinant_grad_kernel) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) @@ -46,6 +50,7 @@ kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index afcb87f9b837118e0a61877c54402a52ec282565..dfeb7691240cf67b67acddb3e450ef037a2e1cb7 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -26,6 +26,23 @@ namespace phi { const DenseTensor& dout, \ DenseTensor* dx); +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx); + #define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -33,6 +50,14 @@ namespace phi { const DenseTensor& dout, \ DenseTensor* dx); +#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + template void ReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, @@ -58,21 +83,6 @@ void TanhTripleGradKernel(const Context& dev_ctx, DenseTensor* d_dout, DenseTensor* d_ddx); -template -void BReluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& dout, - float t_min, - float t_max, - DenseTensor* dx); - -template -void LeakyReluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& dout, - float alpha, - DenseTensor* dx); - template void LeakyReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -81,11 +91,21 @@ void LeakyReluDoubleGradKernel(const Context& dev_ctx, DenseTensor* ddout); template -void ThresholdedReluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& dout, - float threshold, - DenseTensor* dx); +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx); + +template +void EluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + float alpha, + DenseTensor* dx, + DenseTensor* ddout); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); @@ -98,8 +118,18 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu); + DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Exp); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha) + DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold) + DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda) + DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold) + + DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max) + } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index be86b1e688c5b904a2654b86669957bd4350a05f..6a445bf9fb7f6bdbdb3f21354662de83aa682922 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -24,6 +24,21 @@ namespace phi { void name##Kernel( \ const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out); + +#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out); + DECLARE_ACTIVATION_KERNEL(Cos) DECLARE_ACTIVATION_KERNEL(Tan) DECLARE_ACTIVATION_KERNEL(Acos) @@ -40,25 +55,16 @@ DECLARE_ACTIVATION_KERNEL(Tanh) DECLARE_ACTIVATION_KERNEL(Exp) DECLARE_ACTIVATION_KERNEL(Expm1) DECLARE_ACTIVATION_KERNEL(Softsign) +DECLARE_ACTIVATION_KERNEL(TanhShrink) +DECLARE_ACTIVATION_KERNEL(Silu) -template -void BReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float t_min, - float t_max, - DenseTensor* out); - -template -void LeakyReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float alpha, - DenseTensor* out); +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) -template -void ThresholdedReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float threshold, - DenseTensor* out); +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) template void LogitKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index d454700076de269124b6b1c6bf1c2f9bc7e25eee..f878aea94f43d26ea3bb7da05b30c2d0d9c5d614 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -21,102 +21,141 @@ limitations under the License. */ namespace phi { -#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl>( \ + funcs::functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ - name, functor_class, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& dout, \ - float attr, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationGradImpl>( \ - dev_ctx, &x, nullptr, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ - name, functor_class, attr1, attr2) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& dout, \ - float attr1, \ - float attr2, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr1; \ - *(attrs[1].second) = attr2; \ - ActivationGradImpl>( \ - dev_ctx, &x, nullptr, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl>( \ + funcs::functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ - name, functor_class, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& out, \ - const DenseTensor& dout, \ - float attr, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationGradImpl>( \ - dev_ctx, nullptr, &out, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); - -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Exp, funcs::ExpGradFunctor); - -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, - funcs::LeakyReluGradFunctor, +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Exp, ExpGradFunctor); + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, + LeakyReluGradFunctor, alpha); -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( - ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); - -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, - funcs::BReluGradFunctor, +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, + ThresholdedReluGradFunctor, + threshold); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, + SoftShrinkGradFunctor, + lambda); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, + HardShrinkGradFunctor, + threshold); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, + BReluGradFunctor, t_min, t_max); +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + + auto x_flatten = + EigenVector::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad")); + auto out_flatten = EigenVector::Flatten( + GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad")); + auto dout_flatten = EigenVector::Flatten( + GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad")); + auto dx_flatten = + EigenVector::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad")); + auto* place = dev_ctx.eigen_device(); + + if (alpha > 0) { + funcs::ELUGradFunctor functor; + functor.alpha = alpha; + functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten); + } else { + funcs::ELUGradNegativeAlphaFunctor functor; + functor.alpha = alpha; + functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten); + } +} + } // namespace phi PD_REGISTER_KERNEL( @@ -145,6 +184,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, ReluDoubleGradKernel) @@ -152,6 +196,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) PD_REGISTER_KERNEL(tanh_triple_grad, CPU, diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 0535686ab1df91f301a831809ce377855bd4e7fb..9816cf118fb07fa4ef8fd0187b38102a3ab31be5 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -20,99 +20,115 @@ limitations under the License. */ namespace phi { -#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ - template \ - void name##Kernel( \ - const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ - functor_class functor; \ - ActivationImpl(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + funcs::functor_class functor; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - float attr, \ - DenseTensor* out) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationImpl>(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ - name, functor_class, attr1, attr2) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - float attr1, \ - float attr2, \ - DenseTensor* out) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr1; \ - *(attrs[1].second) = attr2; \ - ActivationImpl>(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Exp, funcs::ExpFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Expm1, funcs::Expm1Functor) -DEFINE_CPU_ACTIVATION_KERNEL(Reciprocal, funcs::ReciprocalFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Square, funcs::SquareFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Sqrt, funcs::SqrtFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, funcs::RsqrtFunctor) - -DEFINE_CPU_ACTIVATION_KERNEL(Softsign, funcs::SoftsignFunctor) - -DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha) +DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Exp, funcs::ExpFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Expm1, funcs::Expm1Functor) +DEFINE_CPU_ACTIVATION_KERNEL(Reciprocal, funcs::ReciprocalFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Square, funcs::SquareFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sqrt, funcs::SqrtFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, funcs::RsqrtFunctor) + +DEFINE_CPU_ACTIVATION_KERNEL(Softsign, funcs::SoftsignFunctor) + +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) + DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, - funcs::ThresholdedReluFunctor, + ThresholdedReluFunctor, threshold) -// DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, funcs::MishFunctor, threshold) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, funcs::MishFunctor, threshold) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(STanh, funcs::STanhFunctor, scale_a, scale_b) -// DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, -// funcs::SoftplusFunctor, -// beta, -// threshold) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, + funcs::SoftplusFunctor, + beta, + threshold) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) + +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) } // namespace phi PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} #define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ - PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {} - -PD_REGISTER_ACTIVATION_KERNEL(sin, Sin) -PD_REGISTER_ACTIVATION_KERNEL(cos, Cos) -PD_REGISTER_ACTIVATION_KERNEL(tan, Tan) -PD_REGISTER_ACTIVATION_KERNEL(acos, Acos) -PD_REGISTER_ACTIVATION_KERNEL(asin, Asin) -PD_REGISTER_ACTIVATION_KERNEL(atan, Atan) -PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh) -PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh) -PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh) -PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh) -PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh) -PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh) -PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu) -PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu) -PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu) + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {} + +PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel) +PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel) +PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel) +PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) +PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) // PD_REGISTER_ACTIVATION_KERNEL(mish, Mish) PD_REGISTER_ACTIVATION_KERNEL(stanh, STanh) PD_REGISTER_ACTIVATION_KERNEL(reciprocal, Reciprocal) diff --git a/paddle/phi/kernels/gpu/reduce_any_kernel.cu b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc similarity index 51% rename from paddle/phi/kernels/gpu/reduce_any_kernel.cu rename to paddle/phi/kernels/cpu/determinant_grad_kernel.cc index 39c8cbe442cbd33db5da3c4311abd68641aafcd7..e57d7263f88bfc7b15910f05ec9fa6b45c213e7e 100644 --- a/paddle/phi/kernels/gpu/reduce_any_kernel.cu +++ b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc @@ -12,25 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_any_kernel.h" +#include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" -namespace phi { - -template -void AnyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} +PD_REGISTER_KERNEL(determinant_grad, + CPU, + ALL_LAYOUT, + phi::DeterminantGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/reduce_min_kernel.cu b/paddle/phi/kernels/cpu/determinant_kernel.cc similarity index 51% rename from paddle/phi/kernels/gpu/reduce_min_kernel.cu rename to paddle/phi/kernels/cpu/determinant_kernel.cc index ba37d54895d0d079a4153775ad80314be5a043ba..5810e88e92527fac2c643b239661c50df32cd6f9 100644 --- a/paddle/phi/kernels/gpu/reduce_min_kernel.cu +++ b/paddle/phi/kernels/cpu/determinant_kernel.cc @@ -12,26 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_min_kernel.h" +#include "paddle/phi/kernels/determinant_kernel.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -namespace phi { - -template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" PD_REGISTER_KERNEL( - min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + determinant, CPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/gather_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0a6948018afce277725c50e3cbb0e17ab495a83 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_grad_kernel.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_grad_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction( + dev_ctx, &out_grad, &index, axis_v, x_grad); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction( + dev_ctx, &out_grad, &index, axis_v, x_grad); + } + return; + } + + dev_ctx.template Alloc(x_grad); + + auto dxt = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (x_grad->numel() == 0) return; + + if (index_type == phi::DataType::INT32) { + if (overwrite) { + phi::funcs::ScatterAssign(dev_ctx, out_grad, index, x_grad); + } else { + phi::funcs::ScatterAssignAdd( + dev_ctx, out_grad, index, x_grad); + } + } else if (index_type == phi::DataType::INT64) { + if (overwrite) { + phi::funcs::ScatterAssign(dev_ctx, out_grad, index, x_grad); + } else { + phi::funcs::ScatterAssignAdd( + dev_ctx, out_grad, index, x_grad); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_grad, + CPU, + ALL_LAYOUT, + phi::GatherGradKernel, + float, + double, + int, + uint8_t, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/gather_kernel.cc b/paddle/phi/kernels/cpu/gather_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9207a05b9dcce1daed95a1dbdb99db3c23c5c90d --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function( + dev_ctx, &x, &index, axis_v, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function( + dev_ctx, &x, &index, axis_v, out); + } + return; + } + + dev_ctx.template Alloc(out); + + if (x.numel() == 0) { + return; + } + + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather, + CPU, + ALL_LAYOUT, + phi::GatherKernel, + float, + double, + int, + uint8_t, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..923cb8424115e00f07274f959ffe34adaa9a0327 --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc @@ -0,0 +1,357 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/grid_sample_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +static inline void ClipWithMask(const CPUContext& ctx, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode, + DenseTensor* grid_slice, + DenseTensor* grid_scale) { + auto& place = *ctx.eigen_device(); + grid_scale->Resize(grid_slice->dims()); + ctx.Alloc(grid_scale); + + auto grid_slice_t = EigenTensor::From(*grid_slice); + auto factor = static_cast(max_val * 0.5); + if (!align_corners) { + factor = static_cast((max_val + 1) * 0.5); + } + auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); + + if (padding_mode == "border") { + // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); + auto res = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + + auto in_bound = (res == grid_slice_t); + grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); + grid_slice_t.device(place) = res; + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto is_neg = (grid_slice_t < static_cast(0)); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()); + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + auto reflected = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + auto clipped = reflected.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + auto in_bound = (clipped == reflected).template cast(); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()) * + in_bound; + grid_slice_t.device(place) = clipped; + } + } +} + +template +static void CalcGridLocationsWithGrad(const CPUContext& ctx, + const DenseTensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + grid_x->Resize({n, out_h, out_w}); + grid_y->Resize({n, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h - 1, align_corners); + + ClipWithMask( + ctx, in_w - 1, align_corners, padding_mode, grid_x, grid_x_scale); + ClipWithMask( + ctx, in_h - 1, align_corners, padding_mode, grid_y, grid_y_scale); +} + +template +static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& d1, + const DenseTensor& d2) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto d1_t = EigenTensor::From(d1); + auto d2_t = EigenTensor::From(d2); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); + } + } + } + } + } +} + +template +static void GatherBilinearGrad(const CPUContext& ctx, + const DenseTensor& input, + const DenseTensor& output_grad, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale, + DenseTensor* input_grad, + DenseTensor* grid_grad) { + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s; + DenseTensor d_w, d_e, d_n, d_s; + DenseTensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, // grid_x + grid_y, // grid_y + &x_w, + &x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + // gather output grad value to input grad by corner point coords and weight + GatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); + GatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); + GatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); + GatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); + + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto output_grad_t = EigenTensor::From(output_grad); + + if (grid_grad != nullptr) { + DenseTensor grid_grad_x, grid_grad_y; + grid_grad_x.Resize({n, out_h, out_w}); + grid_grad_y.Resize({n, out_h, out_w}); + ctx.Alloc(&grid_grad_x); + ctx.Alloc(&grid_grad_y); + auto grid_grad_x_t = + EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); + auto grid_grad_y_t = + EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + grid_grad_x_t(i, k, l) += + ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + + (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * + output_grad_t(i, j, k, l); + grid_grad_y_t(i, k, l) += + ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + + (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * + output_grad_t(i, j, k, l); + } + } + } + } + + // const T x_max = static_cast(in_w - 1); + // const T y_max = static_cast(in_h - 1); + + auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); + auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); + grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; + grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; + + // gather grid_grad [x, y] in 3rd Dim + T* grid_grad_data = grid_grad->data(); + T* grid_grad_x_data = grid_grad_x.data(); + T* grid_grad_y_data = grid_grad_y.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_grad_data[2 * i] = grid_grad_x_data[i]; + grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; + } + } +} + +template +static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l); + } + } + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + x_grad->Resize({n, c, in_h, in_w}); + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + if (grid_grad != nullptr) { + grid_grad->Resize({n, out_h, out_w, 2}); + dev_ctx.template Alloc(grid_grad); + phi::funcs::SetConstant()( + dev_ctx, grid_grad, static_cast(0)); + } + + DenseTensor grid_x, grid_y; + DenseTensor grid_x_scale, grid_y_scale; + CalcGridLocationsWithGrad(dev_ctx, + grid, + in_h, + in_w, + align_corners, + padding_mode, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale); + if (mode == "bilinear") { + GatherBilinearGrad(dev_ctx, + x, + out_grid, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale, + x_grad, + grid_grad); + } else { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GatherOutputGradToInputGrad(out_grid, x_grad, grid_x, grid_y); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(grid_sample_grad, + CPU, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..92a528cdda96a191cf73115feb3cf3dd3656305d --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_kernel.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/grid_sample_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Array4 = Eigen::DSizes; + +template +static inline void Clip(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + if (padding_mode == "border") { + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } + } +} + +template +static void CalcGridLocations(const CPUContext& ctx, + const DenseTensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + grid_x->Resize({n, out_h, out_w}); + grid_y->Resize({n, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h - 1, align_corners); + + Clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); + Clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); +} + +template +static void BilinearInter(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* out) { + auto& place = *ctx.eigen_device(); + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s; + DenseTensor d_w, d_e, d_n, d_s; + DenseTensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, + grid_y, + &x_w, + &x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto d_w_scaled_t = + d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_e_scaled_t = + d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_n_scaled_t = + d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_s_scaled_t = + d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + auto output_t = EigenTensor::From(*out); + // bilinear interpolaetion by 4 corner points + output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + + v_en_t * d_w_scaled_t * d_s_scaled_t + + v_ws_t * d_e_scaled_t * d_n_scaled_t + + v_es_t * d_w_scaled_t * d_n_scaled_t; +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + out->Resize(phi::make_ddim({n, c, out_h, out_w})); + dev_ctx.template Alloc(out); + phi::funcs::SetConstant()(dev_ctx, out, static_cast(0)); + + DenseTensor grid_x, grid_y; + CalcGridLocations( + dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y); + + if (mode == "bilinear") { + BilinearInter(dev_ctx, x, &grid_x, &grid_y, out); + } else if (mode == "nearest") { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GetGridPointValue(x, out, grid_x, grid_y); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample, CPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_utils.h b/paddle/phi/kernels/cpu/grid_sample_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..53a16446d7e8c65b3d2d63835e6a2b86c1f96795 --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_utils.h @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void Unnormalize(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + + if (!align_corners) { + auto factor = static_cast((max_val + 1) * 0.5); + grid_slice_t.device(place) = + (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); + } else { + auto factor = static_cast(max_val * 0.5); + grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; + } +} + +template +inline bool IsInBound(T x, T y, T x_max, T y_max) { + if (x < 0 || x > x_max || y < 0 || y > y_max) { + return false; + } + return true; +} + +template +void GetGridPointValue(const DenseTensor& input, + DenseTensor* output, + const DenseTensor& x, + const DenseTensor& y) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int in_h = input.dims()[2]; + const int in_w = input.dims()[3]; + const int out_h = x.dims()[1]; + const int out_w = x.dims()[2]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto output_t = EigenTensor::From(*output).setConstant((T)0); + auto input_t = EigenTensor::From(input); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + output_t(i, j, k, l) = + input_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))); + } + } + } + } + } +} + +template +void AllNeigbors(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* x_w, + DenseTensor* x_e, + DenseTensor* y_n, + DenseTensor* y_s, // positions + DenseTensor* d_w, + DenseTensor* d_e, + DenseTensor* d_n, + DenseTensor* d_s, // distance + DenseTensor* v_wn, + DenseTensor* v_en, + DenseTensor* v_ws, + DenseTensor* v_es) { // values + auto& place = *ctx.eigen_device(); + + const int c = input.dims()[1]; + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + // calculate coords of 4 corner points + x_w->Resize({n, out_h, out_w}); + x_e->Resize({n, out_h, out_w}); + y_n->Resize({n, out_h, out_w}); + y_s->Resize({n, out_h, out_w}); + ctx.Alloc(x_w); + ctx.Alloc(x_e); + ctx.Alloc(y_n); + ctx.Alloc(y_s); + auto x_w_t = EigenTensor::From(*x_w); + auto x_e_t = EigenTensor::From(*x_e); + auto y_n_t = EigenTensor::From(*y_n); + auto y_s_t = EigenTensor::From(*y_s); + + auto grid_x_t = EigenTensor::From(*grid_x); + auto grid_y_t = EigenTensor::From(*grid_y); + + x_w_t.device(place) = grid_x_t.floor(); + x_e_t.device(place) = x_w_t + static_cast(1); + y_n_t.device(place) = grid_y_t.floor(); + y_s_t.device(place) = y_n_t + static_cast(1); + + // calculate distances to 4 sides + d_w->Resize({n, out_h, out_w}); + d_e->Resize({n, out_h, out_w}); + d_n->Resize({n, out_h, out_w}); + d_s->Resize({n, out_h, out_w}); + ctx.Alloc(d_w); + ctx.Alloc(d_e); + ctx.Alloc(d_n); + ctx.Alloc(d_s); + auto d_w_t = EigenTensor::From(*d_w); + auto d_e_t = EigenTensor::From(*d_e); + auto d_n_t = EigenTensor::From(*d_n); + auto d_s_t = EigenTensor::From(*d_s); + d_w_t.device(place) = grid_x_t - x_w_t; + d_e_t.device(place) = x_e_t - grid_x_t; + d_n_t.device(place) = grid_y_t - y_n_t; + d_s_t.device(place) = y_s_t - grid_y_t; + + // calc 4 corner points value + v_wn->Resize({n, c, out_h, out_w}); + v_en->Resize({n, c, out_h, out_w}); + v_ws->Resize({n, c, out_h, out_w}); + v_es->Resize({n, c, out_h, out_w}); + ctx.Alloc(v_wn); + ctx.Alloc(v_en); + ctx.Alloc(v_ws); + ctx.Alloc(v_es); + GetGridPointValue(input, v_wn, *x_w, *y_n); + GetGridPointValue(input, v_en, *x_e, *y_n); + GetGridPointValue(input, v_ws, *x_w, *y_s); + GetGridPointValue(input, v_es, *x_e, *y_s); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/index_select_grad_kernel.cc b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dd50e7df8f06dad1b4a4e51b48cda8d7e2c91eb --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace phi { + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad) { + if (dim < 0) { + dim += out_grad.dims().size(); + } + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + IndexSelectGradInner(ctx, out_grad, index, x_grad, dim); + } else if (index_type == phi::DataType::INT64) { + IndexSelectGradInner( + ctx, out_grad, index, x_grad, dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select_grad, + CPU, + ALL_LAYOUT, + phi::IndexSelectGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..163174580ff785910cc749711b2f917391a691ff --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -0,0 +1,178 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct IndexSelectAdd { + void operator()(const Context& ctx, + int slice_size, + const T* src_pointer, + const T* p_pointer, + T* dist_pointer) { + for (int i = 0; i < slice_size; i++) { + dist_pointer[i] = src_pointer[i] + p_pointer[i]; + } + } +}; + +template +struct IndexSelectAdd< + Context, + T, + typename std::enable_if::value>::type> { + void operator()(const Context& ctx, + int slice_size, + const T* src_pointer, + const T* p_pointer, + T* dist_pointer) { + auto blas = phi::funcs::GetBlas(ctx); + blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer); + } +}; + +template +void IndexSelectInner(const Context& ctx, + DenseTensor* input, + const DenseTensor& index, + DenseTensor* output, + int dim) { + auto input_dim = input->dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + auto index_size = index.dims()[0]; + + DenseTensor index_cpu_copy; + if (!paddle::platform::is_cpu_place(index.place())) { + phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy); + } + const IndexT* index_data = paddle::platform::is_cpu_place(index.place()) + ? index.data() + : index_cpu_copy.data(); + ctx.template Alloc(output); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_data[i], + 0, + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_data[i])); + PADDLE_ENFORCE_LT( + index_data[i], + input_dim[dim], + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_data[i])); + } + + VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums + << "; slice_size: " << slice_size << "; index_size: " << index_size; + + input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size})); + output->Resize(phi::make_ddim({outer_nums, index_size, slice_size})); + + auto input_tensor = EigenTensor::From(*input); + auto output_tensor = EigenTensor::From(*output); + + auto& place = *ctx.eigen_device(); + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_data[j]; + auto output_t = output_tensor.chip(j, 1); + output_t.device(place) = input_tensor.chip(index_value, 1); + } + input->Resize(input_dim); + output->Resize(output_dim); +} + +template +void IndexSelectGradInner(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& index, + DenseTensor* x_grad, + int dim) { + const T* input_data = out_grad.data(); + const IndexT* index_data = index.data(); + + const T* p_output = ctx.template Alloc(x_grad); + T* out_data = ctx.template Alloc(x_grad); + + auto input_dim = out_grad.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = x_grad->dims(); + + phi::funcs::SetConstant set_constant; + set_constant(ctx, x_grad, static_cast(0.0)); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums + << "; slice_size: " << slice_size << "; input_width: " << input_width + << "; output_width: " << output_width + << "; index_size: " << index_size; + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_data[j]; + auto src = input_data + input_start_offset + j * slice_size; + auto p_out = p_output + output_start_offset + index_value * slice_size; + auto dst = out_data + output_start_offset + index_value * slice_size; + IndexSelectAdd index_select_add; + index_select_add(ctx, slice_size, src, p_out, dst); + } + } + x_grad->Resize(output_dim); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/index_select_kernel.cc b/paddle/phi/kernels/cpu/index_select_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5341ede6b2fd846ee3c14d092d166f2832e3bff7 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace phi { + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output) { + auto inputs = x; + if (dim < 0) { + dim += inputs.dims().size(); + } + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + IndexSelectInner(ctx, &inputs, index, output, dim); + } else if (index_type == phi::DataType::INT64) { + IndexSelectInner(ctx, &inputs, index, output, dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select, + CPU, + ALL_LAYOUT, + phi::IndexSelectKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc index d02268940892e3db932692184343807ebe10c1cf..f849322174d295d95fcd9080e090d5a7ece0ec79 100644 --- a/paddle/phi/kernels/cpu/lgamma_kernel.cc +++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/lgamma_kernel.h" + +#include #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 250f656926c0536f71e5724eb9df779c1502a673..0047940fd1704be2862a4a0a4bf46f4886221464 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -19,10 +19,8 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/elementwise.h" -#include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/eigen.h" @@ -55,30 +53,6 @@ namespace phi { } \ } -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - template void DivideRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -164,20 +138,3 @@ PD_REGISTER_KERNEL(multiply_raw, complex64, complex128, phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(sum_raw, - CPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} -PD_REGISTER_KERNEL( - mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 70b6316e1044426a0743c8d5251ca7d210956356..636018ffa68003bc85af22e580bc4ae0768fb1b7 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -23,7 +23,7 @@ #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/mode_grad_kernel.cc b/paddle/phi/kernels/cpu/mode_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca813c1757eacce24ecea8687b7b80bd43c5e8f9 --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(x_grad); + + if (axis == in_dims.size() - 1) { + // allocate the memory for the input_grad + // assign the out_grad to input_grad directly + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements has no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + if (keepdim) { + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc(&out_grad_tmp); + DenseTensor indices_tmp; + dev_ctx.template Alloc(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + // can not assign grad to input_grad, must do the transpose + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + DDim trans_shape(out_dims); + DDim trans_in_shape(in_dims); + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = out_dims[trans_axis[i]]; + trans_in_shape[i] = in_dims[trans_axis[i]]; + } + // transpose the out_grad, indices + DenseTensor trans_dO; + trans_dO.Resize(trans_shape); + dev_ctx.template Alloc(&trans_dO); + + DenseTensor trans_ind; + trans_ind.Resize(trans_shape); + dev_ctx.template Alloc(&trans_ind); + + int ndims = trans_axis.size(); + + if (keepdim) { + // Do transpose + funcs::TransCompute( + ndims, dev_ctx, out_grad, &trans_dO, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans_axis); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc(&out_grad_tmp); + + DenseTensor indices_tmp; + dev_ctx.template Alloc(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + // Do transpose + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); + const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; + + // Assign the out_grad to tranpose input_grad + DenseTensor tmp_out; + tmp_out.Resize(trans_in_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + + // Transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_out, x_grad, trans_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + CPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/mode_kernel.cc b/paddle/phi/kernels/cpu/mode_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6535d1b89af420ee4266981f004983157179f34f --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + auto out_dims = out->dims(); + // axis < 0, cacluate the real axis + if (axis < 0) axis += in_dims.size(); + + T* output_data = dev_ctx.template Alloc(out); + int64_t* indices_data = dev_ctx.template Alloc(indices); + // if axis is not the last dim, transpose it to the last dim, do the + // calculation, then tranpose it back to original axis. + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetMode(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + // get the trans input_dims, out_dims + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + int ndims = trans_axis.size(); + + // transpose the input value + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_out_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + + DenseTensor tmp_indices; + tmp_indices.Resize(trans_out_shape); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + + funcs::GetMode( + input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind); + // transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5a426e93db2cf23962276632fead69565999d37 --- /dev/null +++ b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_grad_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad) { + size_t idx = -1UL; + for (size_t i = 0; i < ins_grad.size(); i++) { + if (ins_grad[i]) { + ctx.template Alloc(ins_grad[i]); + auto t = phi::EigenVector::Flatten(*ins_grad[i]); + t.device(*ctx.eigen_device()) = t.constant(static_cast(0)); + idx = i; + } + } + if (idx == -1UL) return; + + auto rows = ins_grad[idx]->dims()[0]; + auto cols = ins_grad[idx]->numel() / rows; + auto* index = ids.data(); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (ins_grad[k]) { + paddle::memory::Copy(ctx.GetPlace(), + ins_grad[k]->data() + i * cols, + ctx.GetPlace(), + out_grad.data() + i * cols, + cols * sizeof(T)); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex_grad, + CPU, + ALL_LAYOUT, + phi::MultiplexGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/multiplex_kernel.cc b/paddle/phi/kernels/cpu/multiplex_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d9f4c51a981ed8701afe0aa4e7d6a8955f4348c --- /dev/null +++ b/paddle/phi/kernels/cpu/multiplex_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out) { + ctx.template Alloc(out); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), + 0, + errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto index = ids.data(); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE( + k, 0, errors::PreconditionNotMet("index must be nonnegative.")); + PADDLE_ENFORCE_LT(static_cast(k), + ins.size(), + errors::PreconditionNotMet( + "index exceeds the number of candidate tensors.")); + paddle::memory::Copy(ctx.GetPlace(), + out->data() + i * cols, + ctx.GetPlace(), + ins[k]->data() + i * cols, + cols * sizeof(T)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex, + CPU, + ALL_LAYOUT, + phi::MultiplexKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2e32567441ae8ff5315856e3f9132c9553f6d62 --- /dev/null +++ b/paddle/phi/kernels/cpu/qr_kernel.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/kernels/qr_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +namespace phi { + +static inline std::tuple ParseQrMode(const std::string& mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; + } else { + PADDLE_THROW(errors::InvalidArgument( + "QR received unrecognized mode '%s'" + " but expected one of 'reduced' (default), 'r', or 'complete'", + mode)); + } + return std::make_tuple(compute_q, reduced); +} + +template +void QrKernel(const Context& ctx, + const DenseTensor& x, + const std::string& mode, + DenseTensor* q, + DenseTensor* r) { + bool compute_q; + bool reduced_mode; + std::tie(compute_q, reduced_mode) = ParseQrMode(mode); + auto numel = x.numel(); + PADDLE_ENFORCE_GT( + numel, 0, errors::PreconditionNotMet("The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int x_stride = m * n; + int q_stride = m * k; + int r_stride = k * n; + auto* x_data = x.data>(); + T* q_data = nullptr; + if (compute_q) { + q_data = ctx.template Alloc>( + q, batch_size * m * k * sizeof(phi::dtype::Real)); + } + auto* r_data = ctx.template Alloc>( + r, batch_size * k * n * sizeof(phi::dtype::Real)); + + // Implement QR by calling Eigen + for (int i = 0; i < batch_size; ++i) { + const T* x_matrix_ptr = x_data + i * x_stride; + T* r_matrix_ptr = r_data + i * r_stride; + using EigenDynamicMatrix = + Eigen::Matrix; + auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); + Eigen::HouseholderQR qr(x_matrix); + if (reduced_mode) { + auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); + auto r_matrix_view = + qr_top_matrix.template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } else { + auto r_matrix_view = + qr.matrixQR().template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } + + if (compute_q) { + T* q_matrix_ptr = q_data + i * q_stride; + if (reduced_mode) { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } else { + auto q_matrix = qr.householderQ() * EigenDynamicMatrix::Identity(m, m); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(qr, CPU, ALL_LAYOUT, phi::QrKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/reduce_any_kernel.cc b/paddle/phi/kernels/cpu/reduce_any_kernel.cc deleted file mode 100644 index 4fd71f1d0b169866376664bdf2b0b89b13c120e1..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/reduce_any_kernel.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_any_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void AnyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - phi::BoolReduceKernel( - dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_kernel.cc b/paddle/phi/kernels/cpu/reduce_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..bc99e2cb39a6976943ba8fa77f7816c8f5e9b284 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_kernel.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + CPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +PD_REGISTER_KERNEL( + mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} + +PD_REGISTER_KERNEL(prod_raw, + CPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} +PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/cpu/reduce_max_kernel.cc deleted file mode 100644 index f9ea0aa0faf06918253f9037282b924199e3a314..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/reduce_max_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_max_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void MaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_min_kernel.cc b/paddle/phi/kernels/cpu/reduce_min_kernel.cc deleted file mode 100644 index 0a241c81dbe690493b00caf71c0526bb76206e5e..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/reduce_min_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_min_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc deleted file mode 100644 index 9a9bf46e948bc52c6a1e9679b4d3e51b10d89e6d..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_prod_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(reduce_prod, - CPU, - ALL_LAYOUT, - phi::ReduceProdKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a91b8b6c1fcd3306521fb7cbc26d8c7adaf2d4f8 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void bilinear_interpolate_gradient(const int height, + const int width, + T y, + T x, + const T out_grad_this_bin, + const T count, + T* batch_grad_data) { + int x_low, y_low, x_high, y_high; + T w1, w2, w3, w4; + if (y < -1.0 || y > height || x < -1.0 || x > width) { + w1 = w2 = w3 = w4 = 0; + x_low = x_high = y_low = y_high = -1; + return; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + y_low = static_cast(y); + x_low = static_cast(x); + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + *(batch_grad_data + y_low * width + x_low) += diff1; + *(batch_grad_data + y_low * width + x_high) += diff2; + *(batch_grad_data + y_high * width + x_low) += diff3; + *(batch_grad_data + y_high * width + x_high) += diff4; + } +} + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + auto in_dims = x.dims(); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = boxes.dims()[0]; + + if (!dx) { + return; + } + + DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = roi_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + + if ((!out_grad.IsInitialized()) || (output_grad_size <= 0)) { + return; + } + + const T* boxes_data = boxes.data(); + const T* out_grad_data = out_grad.data(); + T* dx_data = dev_ctx.template Alloc(dx); + + auto in_stride = phi::stride(x.dims()); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out_grad.dims()); + + T roi_offset = aligned ? T(0.5) : 0; + for (int n = 0; n < rois_num; ++n) { + int box_batch_idx = box_batch_id_data[n]; + T roi_xmin = boxes_data[0] * spatial_scale - roi_offset; + T roi_ymin = boxes_data[1] * spatial_scale - roi_offset; + T roi_xmax = boxes_data[2] * spatial_scale - roi_offset; + T roi_ymax = boxes_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + for (int c = 0; c < channels; ++c) { + T* batch_grad_data = + dx_data + box_batch_idx * in_stride[0] + c * in_stride[1]; + const T* batch_out_grad_data = + out_grad_data + n * out_stride[0] + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + T out_grad_this_bin = batch_out_grad_data[pool_index]; + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_width / pooled_width); + T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + bilinear_interpolate_gradient(height, + width, + y, + x, + out_grad_this_bin, + count, + batch_grad_data); + } + } + } + } + } + boxes_data += roi_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_align_grad, + CPU, + ALL_LAYOUT, + phi::RoiAlignGradKernel, + float, + double, + int) {} diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc index 35ab99a98eba7e59853fb311d5b2307b69ae31b2..4752a9b3a48fdcce5f3211a7aadca663fb44aa05 100644 --- a/paddle/phi/kernels/cpu/roi_align_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -179,7 +179,7 @@ void AvgPool(const std::vector& interpolated_values, } template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, @@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {} + roi_align, CPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double, int) {} diff --git a/paddle/phi/kernels/cpu/roll_grad_kernel.cc b/paddle/phi/kernels/cpu/roll_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b0d0c0663e4a2eb71f4500baaf43bc8a891acddd --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_grad_kernel.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_grad_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/roll_kernel_impl.h" + +namespace phi { + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad) { + std::vector out_vec; + paddle::framework::TensorToVector(out_grad, dev_ctx, &out_vec); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + DDim input_dim = out_grad.dims(); + auto dims = axis; + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = phi::Dim<1>(out_vec.size()); + } + + for (size_t i = 0; i < nums; i++) { + ShiftAlongDim(out_vec.data(), input_dim, dims[i], 0 - shifts_data[i]); + } + + dev_ctx.template Alloc(x_grad); + paddle::framework::TensorFromVector(out_vec, dev_ctx, x_grad); + x_grad->Resize(out_grad.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll_grad, + CPU, + ALL_LAYOUT, + phi::RollGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roll_kernel.cc b/paddle/phi/kernels/cpu/roll_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..25b64ef257dfb801f0050aad388b9fb0b3020ea5 --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_kernel.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/roll_kernel_impl.h" + +namespace phi { + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out) { + std::vector out_vec; + paddle::framework::TensorToVector(x, dev_ctx, &out_vec); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + DDim input_dim = x.dims(); + auto dims = axis; + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = phi::Dim<1>(out_vec.size()); + } + + for (size_t i = 0; i < nums; i++) { + PADDLE_ENFORCE_EQ( + dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(axis[%d]) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.", + i, + input_dim.size(), + input_dim.size() - 1, + i, + dims[i])); + ShiftAlongDim(out_vec.data(), input_dim, dims[i], shifts_data[i]); + } + dev_ctx.template Alloc(out); + paddle::framework::TensorFromVector(out_vec, dev_ctx, out); + out->Resize(x.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll, + CPU, + ALL_LAYOUT, + phi::RollKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roll_kernel_impl.h b/paddle/phi/kernels/cpu/roll_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..924e71aff31f3f874fb35586f496b9c5952c3757 --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_kernel_impl.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +inline void ShiftAlongDim(T* data, + const DDim& input_dim, + int64_t dim, + int64_t shift) { + if (dim < 0) { + dim += input_dim.size(); + } + if (input_dim[dim] == 0) { + return; + } + shift = shift % input_dim[dim]; + if (shift < 0) { + shift += input_dim[dim]; + } + + auto outer_loops = 1; + for (auto i = 0; i < dim; i++) { + outer_loops *= input_dim[i]; + } + auto slice_width = 1; + for (auto i = dim + 1; i < input_dim.size(); i++) { + slice_width *= input_dim[i]; + } + + VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim + << "; dim: " << dim << "; shift: " << shift + << "; outer_loops: " << outer_loops + << "; slice_width: " << slice_width; + if (shift == 0) { + return; + } + + std::vector head; + auto head_size = slice_width * (input_dim[dim] - shift); + head.resize(head_size); + + for (auto i = 0; i < outer_loops; i++) { + for (auto j = 0; j < head_size; j++) { + head[j] = data[i * input_dim[dim] * slice_width + j]; + } + for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { + auto dst_pos = j - input_dim[dim] + shift; + for (auto k = 0; k < slice_width; k++) { + data[(i * input_dim[dim] + dst_pos) * slice_width + k] = + data[(i * input_dim[dim] + j) * slice_width + k]; + } + } + for (auto j = 0; j < head_size; j++) { + data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc index 585c27bdcec97e11a68cdc536c829f76c000a8df..a5c9dc4c55e495833f40ec7499e6c0373594d319 100644 --- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc index d0413457f8177338aa450211539dc16d0880c74c..ad76a7a86bcb28f291288418c43740ed0b7adb97 100644 --- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc @@ -18,5 +18,11 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + CPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/shape_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc similarity index 74% rename from paddle/phi/kernels/cpu/shape_kernel.cc rename to paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc index 073dc63b2a4348d4091af8c285f9ddebd799acc5..14aca258a2c71a0651868f6917e2707987179ee0 100644 --- a/paddle/phi/kernels/cpu/shape_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc @@ -12,22 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/shape_kernel.h" -#include "paddle/phi/kernels/impl/shape_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL(shape, +PD_REGISTER_KERNEL(tril_triu_grad, CPU, ALL_LAYOUT, - phi::ShapeKernel, + phi::TrilTriuGradKernel, bool, - int, - int8_t, - uint8_t, - int64_t, float, double, - phi::dtype::complex, - phi::dtype::complex) {} + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/reduce_all_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc similarity index 52% rename from paddle/phi/kernels/cpu/reduce_all_kernel.cc rename to paddle/phi/kernels/cpu/tril_triu_kernel.cc index 3e8e38ee4447e67359e694700504c1041d0a15e7..a3d20e55e21fb6e11f63ef05f5de63fbc51caf5e 100644 --- a/paddle/phi/kernels/cpu/reduce_all_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc @@ -12,26 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_all_kernel.h" +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" -namespace phi { - -template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - phi::BoolReduceKernel( - dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} +PD_REGISTER_KERNEL(tril_triu, + CPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/reduce_all_kernel.h b/paddle/phi/kernels/determinant_grad_kernel.h similarity index 61% rename from paddle/phi/kernels/reduce_all_kernel.h rename to paddle/phi/kernels/determinant_grad_kernel.h index 8d7a9ab3faf39c49dd70213ec3edfa98b6e4e406..87228afc51b52bb95838d3161f372c2ebae19b2c 100644 --- a/paddle/phi/kernels/reduce_all_kernel.h +++ b/paddle/phi/kernels/determinant_grad_kernel.h @@ -19,17 +19,10 @@ namespace phi { template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); +void DeterminantGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad); -template -void AllKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/reduce_min_kernel.h b/paddle/phi/kernels/determinant_kernel.h similarity index 61% rename from paddle/phi/kernels/reduce_min_kernel.h rename to paddle/phi/kernels/determinant_kernel.h index 3227ec00e649e520e455fc3b2122cb88b51fc13e..abd5f5691b3e5df6274db1bcd3a915c80bf93ec4 100644 --- a/paddle/phi/kernels/reduce_min_kernel.h +++ b/paddle/phi/kernels/determinant_kernel.h @@ -19,17 +19,8 @@ namespace phi { template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); +void DeterminantKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); -template -void MinKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 5b2e70ceb54d9a82fb5d3fd9e19adcbeb34bd161..372834d657ad48c58a2b53653143d580c29108f6 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -29,11 +29,17 @@ #include #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" + +#ifdef PADDLE_WITH_XPU_KP +#define __forceinline__ __inline__ +#endif namespace phi { namespace funcs { @@ -929,6 +935,236 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhShrinkFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); + } +}; + +template +struct TanhShrinkGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x.tanh() * x.tanh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct HardShrinkFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out) const { + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + out.device(d) = x * (temp1 || temp2).template cast(); + } +}; + +template +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + dx.device(d) = dout * (temp1 || temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 +// otherwise +template +struct SoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); + } +}; + +template +struct SoftShrinkGradFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + (x < static_cast(0)) + .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); + } +}; + +template +struct ELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + // case 1: alpha >= 0 + // dx = dout, if out > 0 + // dx = dout * (out + alpha), if out <= 0 + dx.device(d) = (out > static_cast(0)) + .select(dout, dout * (out + static_cast(alpha))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + // case 2: alpha < 0 + // dx = dout, if x > 0 + // dx = dout * (out + alpha), if x <=0 + dx.device(d) = (x > static_cast(0)) + .select(dout, dout * static_cast(alpha) * x.exp()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* ddX, + DenseTensor* ddOut, + const DenseTensor* dOut, + DenseTensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad")); + + if (dX) { + auto dx = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad")); + dx.device(*d) = ddx * dout * static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast(); + } + + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); + ddout.device(*d) = ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public BaseActivationFunctor { @@ -1508,6 +1744,209 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; + +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // softshrink(x) = x - lambda, if x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + __device__ __forceinline__ T operator()(const T x) const { + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; + +template +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T l = static_cast(lambda); + return (x >= -l && x <= l) ? zero : dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaTanhShrinkFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tanhshrink(x) = x - tanh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(x - tanh(x)); + } +}; + +template +struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * tanh(x)^2 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * tanh(x) * tanh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaHardShrinkFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x + __device__ __forceinline__ T operator()(const T x) const { + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : x; + } +}; + +template +struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (x > -threshold && x < threshold) ? 0 : dout + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaELUFunctor : public BaseActivationFunctor { + using CT = typename phi::dtype::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // elu(x) = x, if x > 0 + // elu(x) = alpha * (e^x - 1), if x <= 0 + __device__ __forceinline__ T operator()(const T arg_x) const { + CT x = static_cast(arg_x); + CT temp = static_cast(alpha) * (exp(x) - one); + CT res = x > zero ? x : temp; + return static_cast(res); + } +}; + +template +struct CudaELUGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // case 1: alpha >= 0 + // dx = dout, if out > 0 + // dx = dout * (out + alpha), if out <= 0 + __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); + MPType a = static_cast(alpha); + MPType out_pos = static_cast(out > zero); + MPType out_neg = static_cast(out <= zero); + return static_cast(dout * (out_pos + out_neg * (out + a))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // case 2: alpha < 0 + // dx = dout, if x > 0 + // dx = dout * (out + alpha), if x <=0 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_out, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); + MPType x = static_cast(arg_x); + MPType a = static_cast(alpha); + MPType x_pos = static_cast(x > zero); + MPType x_neg = static_cast(x <= zero); + return static_cast(dout * (x_pos + x_neg * (out + a))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSiluFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(x / (one + exp(-x))); + } +}; + +template +struct CudaSiluGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h new file mode 100644 index 0000000000000000000000000000000000000000..1b7641762e2639acf3db540280891b518f22eed2 --- /dev/null +++ b/paddle/phi/kernels/funcs/mode.h @@ -0,0 +1,197 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLML +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +static int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +static inline void GetDims( + const phi::DDim& dim, int axis, int* pre, int* n, int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template +static void GetMode(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + std::sort(col_vec.begin(), + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + T mode = 0; + int64_t indice = 0; + int64_t cur_freq = 0; + int64_t max_freq = 0; + for (int64_t i = 0; i < input_width; ++i) { + ++cur_freq; + if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { + if (cur_freq > max_freq) { + max_freq = cur_freq; + mode = col_vec[i].first; + indice = col_vec[i].second; + } + cur_freq = 0; + } + } + t_out[i] = mode; + t_indices[i] = indice; + } +} + +template +static void ModeAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +static void GetModebySort(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + T* out_tensor, + int64_t* indices_tensor) { + DenseTensor input_tmp; + input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); + T* input_tmp_data = dev_ctx.Alloc(&input_tmp); + phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp); + + thrust::device_ptr out_tensor_ptr(out_tensor); + thrust::device_ptr indices_tensor_ptr(indices_tensor); + + for (int64_t i = 0; i < num_rows; ++i) { + T* begin = input_tmp_data + num_cols * i; + T* end = input_tmp_data + num_cols * (i + 1); + thrust::device_vector indices_data(num_cols); + thrust::sequence( + thrust::device, indices_data.begin(), indices_data.begin() + num_cols); + thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); + int unique = 1 + thrust::inner_product(thrust::device, + begin, + end - 1, + begin + 1, + 0, + thrust::plus(), + thrust::not_equal_to()); + thrust::device_vector keys_data(unique); + thrust::device_vector cnts_data(unique); + thrust::reduce_by_key(thrust::device, + begin, + end, + thrust::constant_iterator(1), + keys_data.begin(), + cnts_data.begin()); + auto it = thrust::max_element( + thrust::device, cnts_data.begin(), cnts_data.begin() + unique); + T mode = keys_data[it - cnts_data.begin()]; + int64_t counts = cnts_data[it - cnts_data.begin()]; + auto pos = thrust::find(thrust::device, begin, end, mode); + int64_t index = indices_data[pos - begin + counts - 1]; + out_tensor_ptr[i] = static_cast(mode); + indices_tensor_ptr[i] = static_cast(index); + } +} +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 4cf5e1c02c59757ee8bd0ae91c18d0882b702da1..417c1cd234754f994383988c63ff44ba06794822 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -392,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // paddle::platform::ChangeThreadNum(context, &thread_num); + // backends::gpu::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -460,7 +460,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -527,7 +527,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1293,7 +1293,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1369,7 +1369,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1906,7 +1906,7 @@ class MaxPool2dWithIndexFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -2205,7 +2205,7 @@ class MaxPool3dWithIndexFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc index bf4a21f37223dab5a67649406496e9828b0bcf3f..fbd744430aa11ab1a5a17c76b6d37c10c3085556 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cc +++ b/paddle/phi/kernels/funcs/segment_pooling.cc @@ -149,10 +149,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 305cd39f077bc359543b399a8775b5a92a2eb00d..95606b152672916116813c97cbbc0856d33e49a7 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -453,10 +453,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index 68fe8880a971dd7a56d677a5567bb053f5ba117a..d82d793e5343a48306572068722e2fe587c0aa57 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -93,7 +93,7 @@ inline HOSTDEVICE void IndexToPoint( } inline void GetOutShape(const DDim& x_dims, - const DDim& kernel_dims, + const std::vector& kernel_sizes, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -102,17 +102,17 @@ inline void GetOutShape(const DDim& x_dims, x_dims.size(), 5, phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); - PADDLE_ENFORCE_EQ(kernel_dims.size(), + PADDLE_ENFORCE_EQ(kernel_sizes.size(), 5, phi::errors::InvalidArgument( "the shape of kernel should be (D, H, W, C, OC)")); // infer out shape (*out_dims)[0] = x_dims[0]; - (*out_dims)[4] = kernel_dims[4]; + (*out_dims)[4] = kernel_sizes[4]; for (int i = 1; i < 4; i++) { (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - - dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / + dilations[i - 1] * (kernel_sizes[i - 1] - 1) - 1) / strides[i - 1] + 1; } @@ -131,7 +131,7 @@ template inline void SubmPreProcess(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const int in_channels, const int out_channels, const int half_kernel_size, @@ -142,11 +142,11 @@ inline void SubmPreProcess(const Context& dev_ctx, blas.GEMM(CblasTrans, CblasNoTrans, x.non_zero_elements().dims()[1], - out_grad.non_zero_elements().dims()[1], + out_grad.dims()[1], x.non_zero_elements().dims()[0], static_cast(1), x.non_zero_elements().data(), - out_grad.non_zero_elements().data(), + out_grad.data(), static_cast(0), d_kernel_ptr + half_kernel_size * in_channels * out_channels); @@ -155,11 +155,11 @@ inline void SubmPreProcess(const Context& dev_ctx, T* x_grad_ptr = x_grad->data(); blas.GEMM(CblasNoTrans, CblasTrans, - out_grad.non_zero_elements().dims()[0], + out_grad.dims()[0], in_channels, - out_grad.non_zero_elements().dims()[1], + out_grad.dims()[1], static_cast(1), - out_grad.non_zero_elements().data(), + out_grad.data(), kernel.data() + half_kernel_size * in_channels * out_channels, static_cast(0), x_grad_ptr); diff --git a/paddle/phi/kernels/funcs/tril_triu_compute.h b/paddle/phi/kernels/funcs/tril_triu_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d2b6f1e559d2b5bfc333b60359f5b1e56e9aaadb --- /dev/null +++ b/paddle/phi/kernels/funcs/tril_triu_compute.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +template +class TrilTriuCompute { + public: + HOSTDEVICE TrilTriuCompute(const T* in, + const int diagonal, + const bool lower, + const int64_t H, + const int64_t W, + T* out) + : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {} + + HOSTDEVICE void operator()(int64_t idx) { + const int64_t row = (idx / W_) % H_; + const int64_t col = idx % W_; + const bool mask = + lower_ ? (col - row > diagonal_) : (col - row < diagonal_); + out_[idx] = mask ? static_cast(0) : in_[idx]; + } + + private: + const T* in_; + const int diagonal_; + const bool lower_; + const int64_t H_; + const int64_t W_; + T* out_; +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/gather_grad_kernel.h similarity index 61% rename from paddle/phi/kernels/reduce_max_kernel.h rename to paddle/phi/kernels/gather_grad_kernel.h index 49a350519c506b15a54d41b969dc65b679cc4d06..e53da7b471c7b82efef2319915cc57537ee824b5 100644 --- a/paddle/phi/kernels/reduce_max_kernel.h +++ b/paddle/phi/kernels/gather_grad_kernel.h @@ -14,22 +14,18 @@ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { template -void MaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad); -template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/reduce_any_kernel.h b/paddle/phi/kernels/gather_kernel.h similarity index 68% rename from paddle/phi/kernels/reduce_any_kernel.h rename to paddle/phi/kernels/gather_kernel.h index 0f505817084e792a45c626430eb4e3d7d5a485aa..78ac09125b69298c59622fc69469ba8d28cae919 100644 --- a/paddle/phi/kernels/reduce_any_kernel.h +++ b/paddle/phi/kernels/gather_kernel.h @@ -14,22 +14,16 @@ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { template -void AnyRawKernel(const Context& dev_ctx, +void GatherKernel(const Context& dev_ctx, const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, + const DenseTensor& index, + const Scalar& axis, DenseTensor* out); -template -void AnyKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 40cea77722d04bb6c069ca1132625fbec309893d..8d488d6d9cb587ec876b6ebc36e00338d5af0b03 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -73,7 +73,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, } } -#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -84,7 +84,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ name, functor_class, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -99,7 +99,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ name, functor_class, attr1, attr2) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -116,7 +116,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ @@ -127,7 +127,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ name, functor_class, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -142,33 +142,63 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, CudaAsinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Exp, CudaExpGradFunctor); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, CudaLeakyReluGradFunctor, alpha); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, CudaThresholdedReluGradFunctor, threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, + CudaSoftShrinkGradFunctor, + lambda); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, + CudaHardShrinkGradFunctor, + threshold); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, CudaBReluGradFunctor, t_min, t_max); +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + std::vector ins = {&dout, &out}; + std::vector outs = {dx}; + if (alpha > 0) { + funcs::CudaELUGradFunctor functor; + functor.alpha = alpha; + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } else { + funcs::CudaELUGradNegativeAlphaFunctor functor; + functor.alpha = alpha; + ins.push_back(&x); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -235,6 +265,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel) +<<<<<<< HEAD PD_REGISTER_KERNEL(exp_grad, GPU, @@ -244,3 +275,11 @@ PD_REGISTER_KERNEL(exp_grad, double, int, int64_t) {} +======= +PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) +>>>>>>> 6849d33b62cacccb27797375a212e37a47ca9484 diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index a93b4daa95d6ee6687a4275a214f583813fb9ad2..106d3b9cabd9c22939c8c522d62a2f69dec6a7b8 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -39,12 +39,13 @@ void ActivationGPUImpl(const Context& dev_ctx, funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); } -#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ - template \ - void name##Kernel( \ - const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ - functor_class functor; \ - ActivationGPUImpl(dev_ctx, x, out, functor); \ +#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + funcs::functor_class functor; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ } #define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ @@ -76,6 +77,7 @@ void ActivationGPUImpl(const Context& dev_ctx, dev_ctx, x, out, functor); \ } +<<<<<<< HEAD DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) @@ -96,11 +98,33 @@ DEFINE_GPU_ACTIVATION_KERNEL(Square, funcs::CudaSquareFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Sqrt, funcs::CudaSqrtFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Rsqrt, funcs::CudaRsqrtFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Softsign, funcs::CudaSoftsignFunctor) +======= +DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sin, CudaSinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asin, CudaAsinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atan, CudaAtanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sinh, CudaSinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Cosh, CudaCoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asinh, CudaAsinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acosh, CudaAcoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atanh, CudaAtanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor) +>>>>>>> 6849d33b62cacccb27797375a212e37a47ca9484 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, CudaThresholdedReluFunctor, threshold) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, + CudaHardShrinkFunctor, + threshold) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, CudaMishFunctor, threshold) @@ -157,6 +181,7 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +<<<<<<< HEAD PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel) PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel) @@ -177,3 +202,10 @@ PD_REGISTER_KERNEL(expm1, PD_REGISTER_KERNEL(logit, GPU, ALL_LAYOUT, phi::LogitKernel, float, double) {} PD_REGISTER_KERNEL( square, GPU, ALL_LAYOUT, phi::SquareKernel, float, double, int, int64_t) {} +======= +PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +>>>>>>> 6849d33b62cacccb27797375a212e37a47ca9484 diff --git a/paddle/phi/kernels/gpu/reduce_all_kernel.cu b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu similarity index 51% rename from paddle/phi/kernels/gpu/reduce_all_kernel.cu rename to paddle/phi/kernels/gpu/determinant_grad_kernel.cu index 2963d3f206c2d7737e1ca13c91f69ae94a6a6f77..cce12a87fac72f5ac6edbbeb74de9fe3ae9ede09 100644 --- a/paddle/phi/kernels/gpu/reduce_all_kernel.cu +++ b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu @@ -12,25 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_all_kernel.h" +#include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" -namespace phi { - -template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} +PD_REGISTER_KERNEL(determinant_grad, + GPU, + ALL_LAYOUT, + phi::DeterminantGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..25184083873952638a1f84d8d4b66262363ca9c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/determinant_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" + +PD_REGISTER_KERNEL( + determinant, GPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/gather_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..04149a2f9ee41e797a66eedcb2d797fb87519041 --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_grad_kernel.cu @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + + if (axis_v != 0) { + if (index_type == DataType::INT32) { + phi::funcs::GatherV2GradCUDAFunction( + &out_grad, &index, axis_v, x_grad, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::GatherV2GradCUDAFunction( + &out_grad, &index, axis_v, x_grad, dev_ctx); + } + return; + } + + dev_ctx.template Alloc(x_grad); + auto dxt = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (out_grad.numel() == 0) return; + if (index_type == DataType::INT32) { + phi::funcs::GPUScatterAssign( + dev_ctx, out_grad, index, x_grad, overwrite); + } else if (index_type == DataType::INT64) { + phi::funcs::GPUScatterAssign( + dev_ctx, out_grad, index, x_grad, overwrite); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_grad, + GPU, + ALL_LAYOUT, + phi::GatherGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/gather_kernel.cu b/paddle/phi/kernels/gpu/gather_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7e0c6cc168564e94c5af2e26a8f9ba4acc0594ed --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } else if (index_type == phi::DataType::INT16) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } + return; + } + + dev_ctx.template Alloc(out); + + if (x.numel() == 0) return; + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT16) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather, + GPU, + ALL_LAYOUT, + phi::GatherKernel, + float, + double, + int64_t, + int, + int16_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..457a348be832b006d9f224e3032c369a7fe4bb62 --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu @@ -0,0 +1,324 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +template +static __forceinline__ __device__ void AtomicAdd( + T* data, int h, int w, int sH, int sW, int H, int W, T delta) { + if (InBounds(h, w, H, W)) { + paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T +UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexesWithMask(T in, + int clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +ReflectIndexesWithMask(T in, int twice_low, int twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +ComputePositionsWithMask(T coord, + int size, + PaddingMode padding_mode, + bool align_corners, + T* grad_in) { + T grad_clip, grad_refl; + coord = UnnormalizeWithMask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = ReflectIndexesWithMask(coord, 0, 2 * (size - 1), &grad_refl); + } else { + coord = ReflectIndexesWithMask(coord, -1, 2 * size - 1, &grad_refl); + } + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + + return coord; +} + +template +__global__ void GridSamplerCudaBackwardKernel(const int nthreads, + const T* grad_output, + const T* input, + const T* grid, + int n, + int out_c, + int out_h, + int out_w, + int in_h, + int in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + + int gOut_sN = out_c * out_h * out_w; + int gOut_sC = out_h * out_w; + int gOut_sH = out_w; + int gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + int inp_offset_NC = n * inp_sN; + for (int c = 0; c < out_c; ++c, + inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, + gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd( + gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut); + AtomicAdd( + gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut); + + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(std::nearbyint(ix)); + int iy_nearest = static_cast(std::nearbyint(iy)); + + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (int c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + AtomicAdd(gInp_ptr_NC, + iy_nearest, + ix_nearest, + inp_sH, + inp_sW, + in_h, + in_w, + grad_output[gOut_offset]); + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + GridSamplerCudaBackwardKernel< + T><<>>( + count, + out_grad.data(), + x.data(), + grid.data(), + n, + c, + out_h, + out_w, + in_h, + in_w, + x_grad->data(), + grid_grad_data, + enum_mode, + enum_padding_mode, + align_corners); +} + +} // namespace phi + +PD_REGISTER_KERNEL(grid_sample_grad, + GPU, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f611b46911c4f1555ad27a538d8918f11ae761cc --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_kernel.cu @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" + +namespace phi { + +template +static __forceinline__ __device__ T Unnormalize(T coord, + int size, + bool align_corners) { + if (align_corners) { + return ((coord + 1.f) / 2) * (size - 1); + } else { + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexes(T in, int max_value) { + return min(static_cast(max_value), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T ReflectIndexes(T in, + int twice_low, + int twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T ComputePositions(T coord, + int size, + PaddingMode padding_mode, + bool align_corners) { + coord = Unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexes(coord, size - 1); + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = ReflectIndexes(coord, 0, 2 * (size - 1)); + } else { + coord = ReflectIndexes(coord, -1, 2 * size - 1); + } + coord = ClipIndexes(coord, size - 1); + } + return coord; +} + +template +__global__ void GridSampleCudaKernel(const int nthreads, + int n, + int out_c, + int out_h, + int out_w, + int in_h, + int in_w, + const T* input, + const T* grid, + T* output, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + int out_sN = out_c * out_h * out_w; + int out_sC = out_h * out_w; + int out_sH = out_w; + int out_sW = 1; + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + auto inp_offset_NC = n * inp_sN; + + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + *out_ptr_NCHW = static_cast(0); + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(std::nearbyint(ix)); + int iy_nearest = static_cast(std::nearbyint(iy)); + auto inp_offset_NC = n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if (InBounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3]; + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + GridSampleCudaKernel< + T><<>>( + count, + n, + c, + out_h, + out_w, + in_h, + in_w, + x.data(), + grid.data(), + output_data, + enum_mode, + enum_padding_mode, + align_corners); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample, GPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_utils.h b/paddle/phi/kernels/gpu/grid_sample_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..098eb9defb54904c41f33326b54eabdda657360a --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_utils.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { + +enum class Mode { + bilinear, + nearest, +}; + +enum class PaddingMode { zeros, border, reflect }; + +static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a393eecd51242193fa3b2192ff8e8f1111d350b6 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void index_select_grad_cuda_kernel(const T* output_grad, + T* input_grad, + const IndexT* index, + int64_t nums, + int64_t N, + int64_t stride, + int64_t size, + int64_t delta) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t pre_idx = idx / (stride * size); + int64_t dim_idx = idx % (stride * size) / stride; + IndexT src_dim_idx = index[dim_idx]; + int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; + paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); +} + +template +__global__ void index_select_grad_init(T* input_grad, int64_t N) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + input_grad[idx] = 0.0; +} + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad) { + auto* output_grad_data = out_grad.data(); + auto* in_grad_data = ctx.template Alloc(x_grad); + + auto input_dim = x_grad->dims(); + auto output_dim = out_grad.dims(); + dim = dim >= 0 ? dim : dim + input_dim.size(); + auto stride_dim = phi::stride(input_dim); + int64_t stride = stride_dim[dim]; + int64_t size = output_dim[dim]; + int64_t delta = input_dim[dim] - size; + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + int64_t numel = x_grad->numel(); + int64_t index_nums = index.numel(); + int64_t out_nums = out_grad.numel(); + + auto stream = ctx.stream(); + + index_select_grad_init< + T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(in_grad_data, numel); + + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index.data(); + index_select_grad_cuda_kernel<<< + (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(output_grad_data, + in_grad_data, + index_data, + index_nums, + out_nums, + stride, + size, + delta); + phi::backends::gpu::GpuStreamSync(stream); + } else { + const int* index_data = index.data(); + index_select_grad_cuda_kernel<<< + (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(output_grad_data, + in_grad_data, + index_data, + index_nums, + out_nums, + stride, + size, + delta); + phi::backends::gpu::GpuStreamSync(stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select_grad, + GPU, + ALL_LAYOUT, + phi::IndexSelectGradKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f774522318acb8f44798030870886dd1dc7accc1 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void index_select_cuda_kernel(const T* input, + T* output, + const IndexT* index, + int64_t N, + int64_t stride, + int64_t size, + int64_t delta) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t pre_idx = idx / (stride * size); + int64_t dim_idx = idx % (stride * size) / stride; + IndexT src_dim_idx = index[dim_idx]; + int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; + output[idx] = input[input_idx]; +} + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output) { + auto input_dim = x.dims(); + auto output_dim = output->dims(); + dim = dim >= 0 ? dim : dim + input_dim.size(); + auto stride_dim = phi::stride(input_dim); + int64_t stride = stride_dim[dim]; + int64_t size = output_dim[dim]; + int64_t delta = input_dim[dim] - size; + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + auto* in_data = x.data(); + T* out_data = ctx.template Alloc(output); + + int64_t numel = output->numel(); + auto stream = ctx.stream(); + + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index.data(); + index_select_cuda_kernel<<< + (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(in_data, out_data, index_data, numel, stride, size, delta); + phi::backends::gpu::GpuStreamSync(stream); + } else { + const int* index_data = index.data(); + index_select_cuda_kernel< + T, + int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); + phi::backends::gpu::GpuStreamSync(stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select, + GPU, + ALL_LAYOUT, + phi::IndexSelectKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index af9d5574aa9feaf4d44482bbf0e75f31a5139595..d33f216468220da7ef9fc09533226e8fdd0c702f 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -56,30 +56,6 @@ namespace phi { * Kernels */ -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - // Create the definition of Add DEFINE_CUDA_ELEMENTWISE_OP(Add) // Create the definition of Subtract @@ -147,30 +123,3 @@ PD_REGISTER_KERNEL(multiply_raw, complex64, complex128, bfloat16) {} -PD_REGISTER_KERNEL(sum_raw, - GPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - float16, - bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - -PD_REGISTER_KERNEL(mean_raw, - GPU, - ALL_LAYOUT, - phi::MeanRawKernel, - float, - double, - bool, - float16, - int, - int64_t) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index 2009547fc8d6f18c488faab5fd57cc985990229b..7796132ec07f433d8495d1dba197c06d536e1338 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -28,7 +28,7 @@ #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..43502621c2d3a878a144de1878aa09b8d64b6a47 --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +__global__ void AssignGradWithAxis(const T* grad_out, + const int64_t* indices, + T* grad_in, + int pre, + int post, + int raw_height, + int k) { + // raw_height is the length of topk axis + for (int i = blockIdx.x; i < pre; i += gridDim.x) { + int base_index = i * post * k; + int base_grad = i * post * raw_height; + for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { + grad_in[base_grad + j] = static_cast(0); + } + __syncthreads(); + for (int j = threadIdx.x; j < k * post; j += blockDim.x) { + int64_t idx_ij = indices[base_index + j]; + int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[base_index + j]; + } + } +} + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + + if (axis < 0) axis += in_dims.size(); + // allocate the cuda memory for the x_grad + T* x_grad_data = dev_ctx.template Alloc(x_grad); + const T* out_grad_data = out_grad.data(); + const int64_t* indices_data = indices.data(); + + int pre, n, post; + funcs::GetDims(in_dims, axis, &pre, &n, &post); + + // calcluate the block and grid num + int block_size = funcs::ComputeBlockSize(post); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + AssignGradWithAxis<<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + GPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..629b9722cd6bcfe12d0fb5a7e8be6439f5ea286f --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + // get the input dims + const auto& in_dims = x.dims(); + // calcluate the real axis + if (axis < 0) axis += in_dims.size(); + + auto out_dims = out->dims(); + + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(out); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetModebySort( + dev_ctx, &x, input_width, input_height, output_data, indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + for (int i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + // second step, tranpose the input + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + + int ndims = trans_axis.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + DenseTensor trans_ind; + trans_ind.Resize(trans_out_shape); + int64_t* trans_ind_data = dev_ctx.template Alloc(&trans_ind); + + DenseTensor trans_out; + trans_out.Resize(trans_out_shape); + T* trans_out_data = dev_ctx.template Alloc(&trans_out); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + funcs::GetModebySort(dev_ctx, + &trans_input, + input_width, + input_height, + trans_out_data, + trans_ind_data); + // last step, tranpose back the indices and output + funcs::TransCompute( + ndims, dev_ctx, trans_ind, indices, trans_axis); + funcs::TransCompute(ndims, dev_ctx, trans_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..21576ab608d269340322782c8113c6054c791e74 --- /dev/null +++ b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_grad_kernel.h" + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad) { + size_t idx = -1UL; + for (size_t i = 0; i < ins_grad.size(); i++) { + if (ins_grad[i]) { + ctx.template Alloc(ins_grad[i]); + auto t = phi::EigenVector::Flatten(*ins_grad[i]); + t.device(*ctx.eigen_device()) = t.constant(static_cast(0)); + idx = i; + } + } + if (idx == -1UL) return; + + auto rows = ins_grad[idx]->dims()[0]; + auto cols = ins_grad[idx]->numel() / rows; + DenseTensor index_t_cpu; + paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.stream(); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (ins_grad[k]) { + paddle::memory::Copy(ctx.GetPlace(), + ins_grad[k]->data() + i * cols, + ctx.GetPlace(), + out_grad.data() + i * cols, + cols * sizeof(T), + stream); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex_grad, + GPU, + ALL_LAYOUT, + phi::MultiplexGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/multiplex_kernel.cu b/paddle/phi/kernels/gpu/multiplex_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..743448a46866687cf2ac68be522a306281289252 --- /dev/null +++ b/paddle/phi/kernels/gpu/multiplex_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_kernel.h" + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out) { + ctx.template Alloc(out); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), + 0, + errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + DenseTensor index_t_cpu; + paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.stream(); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE( + k, 0, errors::PreconditionNotMet("index must be nonnegative.")); + PADDLE_ENFORCE_LT(static_cast(k), + ins.size(), + errors::PreconditionNotMet( + "index exceeds the number of candidate tensors.")); + paddle::memory::Copy(ctx.GetPlace(), + out->data() + i * cols, + ctx.GetPlace(), + ins[k]->data() + i * cols, + cols * sizeof(T), + stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex, + GPU, + ALL_LAYOUT, + phi::MultiplexKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index d4d90cac917a2c35e26eca0d57d1c5349b878599..92948bf47c9345e53d1fea54d2be4fd8efbc6f96 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -14,37 +14,161 @@ #include "paddle/phi/kernels/randperm_kernel.h" +#ifdef __NVCC__ +#include +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +#include +namespace cub = hipcub; +#endif +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/randint_kernel.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/memory/memcpy.h" +DECLARE_bool(use_curand); + namespace phi { +template +__global__ void SwapRepeatKernel( + int* key, T* data, int n, uint64_t seed, uint64_t offset) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < n) return; + + bool first_repeat = false; + if (data[idx] == data[idx + 1]) { + if (idx == 0) { + first_repeat = true; + } else if (data[idx] != data[idx - 1]) { + first_repeat = true; + } + } + + if (!first_repeat) return; + + int repeat_size = 1; + for (int i = idx; i < n; ++i) { + if (data[i] == data[i + 1]) { + ++repeat_size; + } else { + break; + } + } + +#ifdef __NVCC__ + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = curand(&state) % (i + 1); +#elif __HIPCC__ + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = hiprand(&state) % (i + 1); +#endif + if (r != i) { + T tmp = data[idx + i]; + data[idx + i] = data[idx + r]; + data[idx + r] = tmp; + } + } +} + template void RandpermRawKernel( const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { - DenseTensor tmp; - tmp.Resize(phi::make_ddim({n})); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); + if (FLAGS_use_curand) { + DenseTensor key; + RandintKernel(dev_ctx, + std::numeric_limits::min(), + std::numeric_limits::max(), + ScalarArray({n}), + phi::DataType::INT32, + &key); + DenseTensor key_out = Empty(dev_ctx, ScalarArray({n})); + + DenseTensor range = Empty(dev_ctx, ScalarArray({n})); + T* range_data = range.data(); + funcs::ForRange for_range(dev_ctx, n); + for_range([range_data] __device__(size_t idx) { + range_data[idx] = static_cast(idx); + }); + + out->Resize(phi::make_ddim({n})); + T* out_data = dev_ctx.template Alloc(out); + + // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to + // improve performance of radix sort. + double n_d = static_cast(n); + int begin_bit = 0; + int end_bit = + std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); + + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto gen_cuda = dev_ctx.GetGenerator(); + auto seed_offset = gen_cuda->IncrementOffset(n); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + SwapRepeatKernel<<>>( + key_out.data(), out_data, n, seed, offset); } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } + DenseTensor tmp; + tmp.Resize(phi::make_ddim({n})); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - for (int i = 0; i < n; ++i) { - tmp_data[i] = static_cast(i); - } - std::shuffle(tmp_data, tmp_data + n, *engine); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } - T* out_data = dev_ctx.template Alloc(out); - auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); - paddle::memory::Copy( - out->place(), out_data, tmp.place(), tmp_data, size, 0); + for (int i = 0; i < n; ++i) { + tmp_data[i] = static_cast(i); + } + std::shuffle(tmp_data, tmp_data + n, *engine); + + T* out_data = dev_ctx.template Alloc(out); + auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); + paddle::memory::Copy( + out->place(), out_data, tmp.place(), tmp_data, size, 0); + } } template diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6cbe699e8e05831b049536b06b1fdadcc145537d --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -0,0 +1,158 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + GPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + float16, + bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL(mean_raw, + GPU, + ALL_LAYOUT, + phi::MeanRawKernel, + float, + double, + bool, + float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(prod_raw, + GPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} + +PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu index 98c3986c51dd6829287f5316ae9eb52f328372ab..ddbc08b06c84b0afe42091ddf9a53a928621ef6d 100644 --- a/paddle/phi/kernels/gpu/reduce_max_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf076128b69396196f59a8accd0c282322f8f49a --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -0,0 +1,260 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__device__ void BilinearInterpolateGradient(const int height, + const int width, + T y, + T x, + T* w1, + T* w2, + T* w3, + T* w4, + int* x_low, + int* x_high, + int* y_low, + int* y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return; + } + + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + *y_low = static_cast(y); + *x_low = static_cast(x); + if (*y_low >= height - 1) { + *y_high = *y_low = height - 1; + y = static_cast(*y_low); + } else { + *y_high = *y_low + 1; + } + if (*x_low >= width - 1) { + *x_high = *x_low = width - 1; + x = static_cast(*x_low); + } else { + *x_high = *x_low + 1; + } + T ly = y - *y_low, lx = x - *x_low; + T hy = 1. - ly, hx = 1. - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + + return; +} + +template +__global__ void GPURoiAlignBackward(const int nthreads, + const T* input_rois, + const T* out_grad, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + int* roi_batch_id_data, + T* input_grad, + const bool continuous_coordinate) { + CUDA_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_offset = continuous_coordinate ? T(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_input_grad = + input_grad + (roi_batch_ind * channels + c) * height * width; + + const T* offset_out_grad = + out_grad + (n * channels + c) * pooled_height * pooled_width; + const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + const T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T w1 = 0, w2 = 0, w3 = 0, w4 = 0; + int x_low = -1, x_high = -1, y_low = -1, y_high = -1; + BilinearInterpolateGradient(height, + width, + y, + x, + &w1, + &w2, + &w3, + &w4, + &x_low, + &x_high, + &y_low, + &y_high); + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_low * width + x_low, diff1); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_low * width + x_high, diff2); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_high * width + x_low, diff3); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_high * width + x_high, diff4); + } + } + } + } +} + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + int rois_num = boxes.dims()[0]; + int channels = x.dims()[1]; + int height = x.dims()[2]; + int width = x.dims()[3]; + + if (!dx) { + return; + } + + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_size = dev_ctx.template HostAlloc(&box_batch_id_list); + + auto cplace = phi::CPUPlace(); + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(cplace, + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_size[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_size[i] = n; + } + } + } + auto roi_ptr = + paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int)); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + int bytes = box_batch_id_list.numel() * sizeof(int); + paddle::memory::Copy( + gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream()); + dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPURoiAlignBackward<<>>( + output_grad_size, + boxes.data(), + out_grad.data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + roi_id_data, + dx->data(), + aligned); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align_grad, GPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index 2f906fa4f663b6da65a3e986af2214dfb49f2ec0..cd4ed29cdd1dd7b48a9135597ca79ab401a0cfba 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -71,7 +71,7 @@ __device__ T BilinearInterpolate( } template -__global__ void GPUROIAlignForward(const int nthreads, +__global__ void GPURoiAlignForward(const int nthreads, const T* input_data, const T* input_rois, const float spatial_scale, @@ -137,7 +137,7 @@ __global__ void GPUROIAlignForward(const int nthreads, } template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, @@ -233,7 +233,7 @@ void ROIAlignKernel(const Context& dev_ctx, int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); paddle::memory::Copy( gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); - GPUROIAlignForward<<>>( + GPURoiAlignForward<<>>( output_size, x.data(), boxes.data(), @@ -252,4 +252,4 @@ void ROIAlignKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - roi_align, GPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double) {} + roi_align, GPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..93e9e81882c9e6eacd5f9ee91fa7541495ef2663 --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_grad_kernel.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/roll_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad) { + auto* in_data = out_grad.data(); + T* out_data = dev_ctx.template Alloc(x_grad); + int64_t numel = out_grad.numel(); + auto stream = dev_ctx.stream(); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + auto input_dim = out_grad.dims(); + auto stride_dim = phi::stride(input_dim); + + std::vector strides(nums), sizes(nums); + if (axis.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); + int64_t size = input_dim[dim]; + if (size != 0) { + shifts_data[i] = ((-shifts_data[i]) % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts_data.size())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll_grad, + GPU, + ALL_LAYOUT, + phi::RollGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1543335d3a0c5884d6b82394253bb4e8dda8cef0 --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_kernel.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/gpu/roll_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out) { + auto* in_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); + int64_t numel = x.numel(); + auto stream = dev_ctx.stream(); + + auto shifts_data = shifts.GetData(); + + size_t nums = shifts_data.size(); + auto input_dim = x.dims(); + auto stride_dim = phi::stride(input_dim); + + std::vector strides(nums), sizes(nums); + if (axis.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts_data[0] = (shifts_data[0] % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); + int64_t size = input_dim[dim]; + + if (size != 0) { + shifts_data[i] = (shifts_data[i] % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts_data.size())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll, + GPU, + ALL_LAYOUT, + phi::RollKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..abe3ee470b4bc6b3951e1ad2da09544e319cbcac --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void RollCudaKernel(const T* input, + T* output, + int64_t N, + phi::Array shifts, + phi::Array strides, + phi::Array sizes) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t output_idx = idx; + int64_t new_dim_idx = 0; + +#pragma unroll + for (size_t i = 0; i < Rank; i++) { + new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; + if (new_dim_idx >= sizes[i]) { + output_idx += (shifts[i] - sizes[i]) * strides[i]; + } else { + output_idx += shifts[i] * strides[i]; + } + } + output[output_idx] = input[idx]; +} + +#define CALL_ROLL_CUDA_KERNEL(N) \ + case N: { \ + phi::Array _strides; \ + phi::Array _shifts; \ + phi::Array _sizes; \ + for (size_t idx = 0; idx < N; ++idx) { \ + _strides[idx] = strides[idx]; \ + _shifts[idx] = shifts_data[idx]; \ + _sizes[idx] = sizes[idx]; \ + } \ + RollCudaKernel< \ + T, \ + N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ + PADDLE_CUDA_NUM_THREADS, \ + 0, \ + stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes); \ + break; \ + } + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu index d9618dc159a6d3f5b24bdfcfdb219ec649e051f9..9d1769e18b4b809fbc353513a05553e0ccd97572 100644 --- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu @@ -24,4 +24,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu index c38e935adf837ef00c48fa31bc1e37eea2948673..3128e534166acba6ca136331ad8efea66b18621f 100644 --- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu @@ -19,5 +19,11 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + GPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu similarity index 53% rename from paddle/phi/kernels/gpu/reduce_prod_kernel.cu rename to paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu index 278d4a6e5ab79a7519e1052a2d05c6ecda62692f..bc3ef1bc623bb27ac2452d1e908c389543598011 100644 --- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu +++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu @@ -12,32 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" -namespace phi { - -template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(reduce_prod, +PD_REGISTER_KERNEL(tril_triu_grad, GPU, ALL_LAYOUT, - phi::ReduceProdKernel, + phi::TrilTriuGradKernel, + bool, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c48edf9eff25aa68abcfe0b08dd7ab659aaa0fb --- /dev/null +++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu, + GPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 2b2dd5118969cf35c4762f3ab774ce41c04d2e4d..77159bfc876da603f703a13592f525d808adfbbf 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -121,17 +121,10 @@ struct ReduceMaxFunctor { }; template -struct ExpSubFunctor { - HOSTDEVICE inline ExpSubFunctor() { y = static_cast(0.0f); } - - HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {} - +struct ExpFunctor { HOSTDEVICE inline Ty operator()(const Tx& x) const { - return static_cast(std::exp(x - y)); + return static_cast(std::exp(x)); } - - private: - Tx y; }; template @@ -293,10 +286,14 @@ __global__ void WarpSoftmaxForward(T* softmax, } // data src - AccT srcdata[kBatchSize][kLoopsV][kVSize]; - T src_tmp[kBatchSize][kLoopsV][kVSize]; - kps::Init(&srcdata[0][0][0], kLowInf); - kps::Init(&src_tmp[0][0][0], -std::numeric_limits::infinity()); + // src_data: the raw data form global memory + // sub_data: store the data obtained by (src_data - max), used by log_softmax + // exp_data: store the data obtained by (exp(sub_data)), used by softmax + T src_data[kBatchSize][kLoopsV][kVSize]; + AccT sub_data[kBatchSize][kLoopsV][kVSize]; + AccT exp_data[kBatchSize][kLoopsV][kVSize]; + kps::Init(&sub_data[0][0][0], kLowInf); + kps::Init(&src_data[0][0][0], -std::numeric_limits::infinity()); // data dst T out_tmp[kBatchSize][kLoopsV][kVSize]; @@ -313,11 +310,11 @@ __global__ void WarpSoftmaxForward(T* softmax, for (int i = 0; i < kBatchSize; ++i) { const VecT* src_v = reinterpret_cast(&src[(first_batch + i) * stride]); - VecT* reg_v = reinterpret_cast(&src_tmp[i][0][0]); + VecT* reg_v = reinterpret_cast(&src_data[i][0][0]); kps::ReadData( ®_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1); kps::ElementwiseUnary>( - &srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor()); + &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor()); } // compute max @@ -327,14 +324,16 @@ __global__ void WarpSoftmaxForward(T* softmax, 1, ReduceMaxFunctor, kMode::kLocalMode>( - &max[0], &srcdata[0][0][0], ReduceMaxFunctor(), true); + &max[0], &sub_data[0][0][0], ReduceMaxFunctor(), true); WarpReduceMax(max); // compute sum #pragma unroll for (int i = 0; i < kBatchSize; ++i) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor(max[i])); + kps::ElementwiseUnary>( + &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor(max[i])); + kps::ElementwiseUnary>( + &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor()); } kps::Reduce, kMode::kLocalMode>( - &sum[0], &srcdata[0][0][0], kps::AddFunctor(), true); + &sum[0], &exp_data[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); // write data to global memory @@ -352,15 +351,13 @@ __global__ void WarpSoftmaxForward(T* softmax, reinterpret_cast(&softmax[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); if (LogMode) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor()); kps::ElementwiseUnary>( &out_tmp[i][0][0], - &srcdata[i][0][0], + &sub_data[i][0][0], UnarySubFunctor(std::log(sum[i]))); } else { kps::ElementwiseUnary>( - &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor(sum[i])); } kps::WriteData( &softmax_v[0], ®_v[0], idx_max_v[i], 0, kWarpSize, 1); diff --git a/paddle/phi/kernels/grid_sample_grad_kernel.h b/paddle/phi/kernels/grid_sample_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..50a8d5be260bd387476467e2cdddaeb59f943b9b --- /dev/null +++ b/paddle/phi/kernels/grid_sample_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GridSampleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &grid, + const DenseTensor &out_grid, + const std::string &mode, + const std::string &padding_mode, + bool align_corners, + DenseTensor *x_grad, + DenseTensor *grid_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/grid_sample_kernel.h b/paddle/phi/kernels/grid_sample_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2e1e9b508649b22de086a103537f4984b7f693e5 --- /dev/null +++ b/paddle/phi/kernels/grid_sample_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GridSampleKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &grid, + const std::string &mode, + const std::string &padding_mode, + bool align_corners, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index a48a6226f23f8d9976dc86e59b051828b1d71b21..a95f49c0e7cfd32802f1d1899a1fe1590fdf6a87 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -202,4 +202,24 @@ void TanhTripleGradKernel(const Context& dev_ctx, d_ddx); // output } +template +void EluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + float alpha, + DenseTensor* dx, + DenseTensor* ddout) { + if (dx) { + dx->Resize(x.dims()); + dev_ctx.template Alloc(dx); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + } + funcs::ELUGradGradFunctor functor; + functor.alpha = alpha; + functor(dev_ctx, &x, &ddx, ddout, &dout, dx); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 9f557e746378939e32a32955e758cdc5c510f229..72741e6d3a01ae374c43a24ac519ff5106b5733e 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -24,13 +24,12 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/tril_triu_op.h" - namespace phi { template @@ -115,7 +114,7 @@ void CholeskySolveGradKernel(const Context& dev_ctx, const auto H = y_bst_dims_vec[y_bst_ndim - 2]; const auto W = y_bst_dims_vec[y_bst_ndim - 1]; phi::funcs::ForRange y_for_range(dev_ctx, dy_bst.numel()); - paddle::operators::TrilTriuCompute tril_triu_functor( + phi::funcs::TrilTriuCompute tril_triu_functor( dy_bst.data(), 0, !upper, H, W, dy_bst_upper.data()); y_for_range(tril_triu_functor); diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..038ef0c214bc73b41fc3aff661e296207d615df1 --- /dev/null +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -0,0 +1,159 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { +namespace detail { + +template +struct FoundZeroFunctor { + FoundZeroFunctor(const T* x, int64_t numel, bool* res) + : x_(x), numel_(numel), res_(res) {} + HOSTDEVICE void operator()(size_t idx) const { + if (*res_ || idx >= static_cast(numel_)) { + // founded zero number + return; + } + *res_ = (x_[idx] == static_cast(0)); + } + const T* x_; + int64_t numel_; + bool* res_; +}; + +template +inline bool CheckMatrixInvertible(const Context& dev_ctx, + const DenseTensor* det) { + auto numel = det->numel(); + + DenseTensor dev_tensor = phi::Empty(dev_ctx, {1}); + + // set false + phi::funcs::SetConstant zero; + zero(dev_ctx, &dev_tensor, false); + + // find whether zero + phi::funcs::ForRange for_range(dev_ctx, numel); + FoundZeroFunctor functor(det->data(), numel, dev_tensor.data()); + for_range(functor); + + // copy to host + DenseTensor cpu_tensor; + phi::Copy(dev_ctx, dev_tensor, phi::CPUPlace(), false, &cpu_tensor); + + // if founded zero, the matrix is not invertible + // else the matrix is invertible + auto* res = cpu_tensor.data(); + return !(*res); +} + +} // namespace detail + +template +void DeterminantGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + auto input_dims_size = x.dims().size(); + if (input_dims_size > 2) { + PADDLE_ENFORCE_EQ( + out_grad.dims().size() + 2, + input_dims_size, + phi::errors::InvalidArgument( + "The grad tensor of det dims size should be 2 less than" + " input tensor's, but here differ %d", + input_dims_size - out_grad.dims().size())); + } else if (input_dims_size == 2) { + // input dims size 2 and grad dims size 1 is possible + PADDLE_ENFORCE_EQ( + out_grad.dims().size(), + 1, + phi::errors::InvalidArgument( + "The grad tensor of det dims size should be 2 less than" + " input tensor's, but here differ %d", + input_dims_size - out_grad.dims().size())); + } else { + // checked in forward, pass + } + + // Check Whether the matrix is invertible + // (matrix A not invertible) == (det(A)=0) + if (!detail::CheckMatrixInvertible(dev_ctx, &out)) { + // The matrix is not invertible + VLOG(3) << "The input matrix not invertible!"; + x_grad->Resize(x.dims()); + phi::Full( + dev_ctx, phi::vectorize(x.dims()), static_cast(0.0f), x_grad); + return; + } + + // The matrix is invertible + // let |A| = Determinant(A) + // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf + // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, + // -1) + + // First: inverse(A) + DenseTensor inverse_A; + // A must be square matrices! + inverse_A.Resize(x.dims()); + dev_ctx.template Alloc(&inverse_A); + + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, x, &inverse_A); + + VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); + + // Second: inverse(A).transpose(-2, -1) + DenseTensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " + << transpose_inverse_A.dims(); + + // Third: dA * |A| + auto mul_dA_detA = phi::Multiply(dev_ctx, out_grad, out); + VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); + + // Fourth: unsqueeze(dA * |A|, [-1, -2]) + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); + VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); + + // Finally: unsqueeze(dA * |A|) * inverse(A) + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); + + VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); + + x_grad->Resize(x.dims()); + VLOG(3) << "d|A| dims: " << x_grad->dims(); + + phi::Copy(dev_ctx, res, dev_ctx.GetPlace(), false, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f3a611b89c95c64184a953e5069c8b200317da46 --- /dev/null +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -0,0 +1,124 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include +#include +#include +#include +#include + +#include "paddle/phi/core/enforce.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { +namespace detail { +template +class EigenMatrix {}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXf; +}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXd; +}; + +inline int64_t GetBatchCount(const DDim dims) { + int64_t batch_count = 1; + auto dim_size = dims.size(); + PADDLE_ENFORCE_GE( + dim_size, + 2, + phi::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + + // Cumulative multiplying each dimension until the last 2 to get the batch + // count, + // for example a tensor with shape [3,3,3,3], the batch count of matrices is + // 9. + for (int64_t i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + + return batch_count; +} +} // namespace detail + +template +struct DeterminantFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + int64_t rank, + int64_t batch_count, + DenseTensor* output) { + std::vector input_vec; + std::vector output_vec; + paddle::framework::TensorToVector(input, dev_ctx, &input_vec); + for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel + auto begin_iter = input_vec.begin() + i * rank * rank; + auto end_iter = input_vec.begin() + (i + 1) * rank * rank; + std::vector sub_vec(begin_iter, + end_iter); // get every square matrix data + typename detail::EigenMatrix::MatrixType matrix(rank, rank); + for (int64_t i = 0; i < rank; ++i) { + for (int64_t j = 0; j < rank; ++j) { + matrix(i, j) = sub_vec[rank * i + j]; + } + } + output_vec.push_back(matrix.determinant()); + } + paddle::framework::TensorFromVector(output_vec, output); + } +}; + +template +void DeterminantKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto input_dim = vectorize(x.dims()); + auto input_dim_size = input_dim.size(); + + auto batch_count = detail::GetBatchCount(x.dims()); + VLOG(10) << "input dim:" << x.dims(); + PADDLE_ENFORCE_GE( + input_dim_size, + 2, + phi::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], + input_dim[input_dim_size - 2], + phi::errors::InvalidArgument( + "the input matrix should be square matrix.")); + auto rank = input_dim[input_dim_size - 1]; // square matrix length + DeterminantFunctor()(dev_ctx, x, rank, batch_count, out); + auto output_dims = phi::slice_ddim(x.dims(), 0, input_dim_size - 2); + if (input_dim_size > 2) { + out->Resize(output_dims); + } else { + // when input is a two-dimension matrix, The det value is a number. + out->Resize({1}); + } + VLOG(10) << "output dim:" << out->dims(); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h index a1b33f5a331ba8add6159d0089ddfa602888bcdf..8fb1f1c4fa3615cbf33fb7b6e4b0609dbcc2c3a0 100644 --- a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { template diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h index 9b1e4b1d3a65d5c0da831a36152cff85a3353fa3..044adb0230cac4d0dc6bf9e9348968e4d7c60b5d 100644 --- a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -21,12 +21,11 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/tril_triu_op.h" - namespace phi { template @@ -119,7 +118,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx, const auto H = dims[dims.size() - 2]; const auto W = dims[dims.size() - 1]; phi::funcs::ForRange x_for_range(dev_ctx, dx_bst.numel()); - paddle::operators::TrilTriuCompute tril_triu_functor( + phi::funcs::TrilTriuCompute tril_triu_functor( dx_bst.data(), unitriangular, !upper, H, W, dx_bst_upper.data()); x_for_range(tril_triu_functor); diff --git a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..dcc7224b5075ca77db813089af6048f0809c9f35 --- /dev/null +++ b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/tril_triu_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" + +namespace phi { + +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad) { + const auto* dout_data = out_grad.data(); + auto* dx_data = ctx.template Alloc(x_grad); + + const auto& dims = out_grad.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + + phi::funcs::ForRange for_range( + ctx, static_cast(out_grad.numel())); + phi::funcs::TrilTriuCompute tril_triu_grad_computer( + dout_data, diagonal, lower, H, W, dx_data); + for_range(tril_triu_grad_computer); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..959169d87cefd877a4fb056218dd761a96f23136 --- /dev/null +++ b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/tril_triu_kernel.h" + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" + +namespace phi { + +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out) { + const auto* x_data = x.data(); + auto* out_data = ctx.template Alloc(out); + + const auto& dims = x.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + phi::funcs::ForRange for_range(ctx, static_cast(x.numel())); + + phi::funcs::TrilTriuCompute tril_triu_computer( + x_data, diagonal, lower, H, W, out_data); + for_range(tril_triu_computer); +} + +} // namespace phi diff --git a/paddle/phi/kernels/index_select_grad_kernel.h b/paddle/phi/kernels/index_select_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c3dc1595989bf2879c3e20187eaa53b6df75a7f0 --- /dev/null +++ b/paddle/phi/kernels/index_select_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/index_select_kernel.h b/paddle/phi/kernels/index_select_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..124b6897311575223859fba882488a535a6310f4 --- /dev/null +++ b/paddle/phi/kernels/index_select_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output); + +} // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index a5d3f51e5447fa41447c4b59c3beb8c917f8a0e5..5aad2375ebb85a52684946fe35b2a5b17a0b9efd 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -19,27 +19,6 @@ namespace phi { -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); -} - template void AddKernel(const Context& dev_ctx, const DenseTensor& x, @@ -81,25 +60,6 @@ void MultiplyKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PD_REGISTER_KERNEL( - mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} - -PD_REGISTER_KERNEL(sum, - CPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - PD_REGISTER_KERNEL(add, CPU, ALL_LAYOUT, @@ -147,32 +107,7 @@ PD_REGISTER_KERNEL(multiply, phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(mean, - GPU, - ALL_LAYOUT, - phi::MeanKernel, - float, - double, - bool, - int, - int64_t, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sum, - GPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} + PD_REGISTER_KERNEL(add, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index 7569cbcff087d796313c24d46ff7b7fd9cf7e2eb..ddc3a46e989f5cc86e294eb16ca0f82fcd7d8115 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -16,43 +16,8 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" - namespace phi { -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out); - template void AddRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -149,29 +114,4 @@ DenseTensor Multiply(const Context& dev_ctx, return dense_out; } -template -DenseTensor Mean(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); - MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); - return dense_out; -} - -template -DenseTensor Sum(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumInferMeta(x, axis, dtype, keep_dim, &meta_out); - SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); - return dense_out; -} - } // namespace phi diff --git a/paddle/phi/kernels/mode_grad_kernel.h b/paddle/phi/kernels/mode_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ccde8c3648fa556401f1937c78039743daf43f4c --- /dev/null +++ b/paddle/phi/kernels/mode_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/mode_kernel.h b/paddle/phi/kernels/mode_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..831c4369304e5c5d27cddf01bcba021745bf7083 --- /dev/null +++ b/paddle/phi/kernels/mode_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); + +} // namespace phi diff --git a/paddle/phi/kernels/multiplex_grad_kernel.h b/paddle/phi/kernels/multiplex_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b32c9dbe100584f7076f34d848d3e5112315f83d --- /dev/null +++ b/paddle/phi/kernels/multiplex_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/multiplex_kernel.h b/paddle/phi/kernels/multiplex_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..341c6d5cabb7ce1d67090c7533bc8c45622f4786 --- /dev/null +++ b/paddle/phi/kernels/multiplex_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/qr_kernel.h b/paddle/phi/kernels/qr_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9c3dfb16601267ec8a1d2535f6854c2a31dba5a8 --- /dev/null +++ b/paddle/phi/kernels/qr_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void QrKernel(const Context& ctx, + const DenseTensor& x, + const std::string& mode, + DenseTensor* q, + DenseTensor* r); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc deleted file mode 100644 index 3cbd0976ad8d238be7462f20165c919df01a80ea..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/reduce_all_kernel.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_all_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void AllKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} -#endif diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc deleted file mode 100644 index 371dd972129cc8fcf5f0e390f18749f8c5ad7f75..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/reduce_any_kernel.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_any_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void AnyKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} -#endif diff --git a/paddle/phi/kernels/reduce_kernel.cc b/paddle/phi/kernels/reduce_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7638c782d547d6b69d0c740827abf96e3ffda0c5 --- /dev/null +++ b/paddle/phi/kernels/reduce_kernel.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); +} + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + ProdRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL( + mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} + +PD_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL( + prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL( + min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} +PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(mean, + GPU, + ALL_LAYOUT, + phi::MeanKernel, + float, + double, + bool, + int, + int64_t, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL( + prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL( + min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} +PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_kernel.h b/paddle/phi/kernels/reduce_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..75f52c36beb76abcd0cc05a7b46935a56d35da64 --- /dev/null +++ b/paddle/phi/kernels/reduce_kernel.h @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out); + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +DenseTensor Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); + MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); + return dense_out; +} + +template +DenseTensor Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumInferMeta(x, axis, dtype, keep_dim, &meta_out); + SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc deleted file mode 100644 index de172a12d72884fb018acbb42c077efc825508ce..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_max_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -#endif diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc deleted file mode 100644 index c8ec6b3678c58d38d19853c04128283d979f50de..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_min_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void MinKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -#endif diff --git a/paddle/phi/kernels/roi_align_grad_kernel.h b/paddle/phi/kernels/roi_align_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..eea1fa03886a4a02dbc614052e1f280c2610f1ad --- /dev/null +++ b/paddle/phi/kernels/roi_align_grad_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h index 16b52c563a592f0cc23ddca94f554f5dc49e8ccf..9734da53b7f453d492cc60ee8930f54e7ca74edc 100644 --- a/paddle/phi/kernels/roi_align_kernel.h +++ b/paddle/phi/kernels/roi_align_kernel.h @@ -20,7 +20,7 @@ namespace phi { template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/kernels/roll_grad_kernel.h b/paddle/phi/kernels/roll_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..331f3626e56574615a2d6b1680335638b060846d --- /dev/null +++ b/paddle/phi/kernels/roll_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/roll_kernel.h b/paddle/phi/kernels/roll_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..56f32174a4c0005968acf147b2daf25914ff01b1 --- /dev/null +++ b/paddle/phi/kernels/roll_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc index 9bcd5d8544e2d73961d72115023446d427e8895e..67126d82042b28de8c560a55046e50029153290d 100644 --- a/paddle/phi/kernels/selected_rows/shape_kernel.cc +++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/shape_kernel.h" namespace phi { namespace sr { @@ -25,15 +26,7 @@ template void ShapeKernel(const Context& ctx, const SelectedRows& input, DenseTensor* out) { - auto in_var = input; - phi::DDim in_dims; - in_dims = in_var.value().dims(); - auto out_t = out; - out_t->Resize({in_dims.size()}); - auto out_data = ctx.template HostAlloc(out_t); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } + phi::ShapeKernel(ctx, input.value(), out); } } // namespace sr diff --git a/paddle/phi/kernels/gpu/shape_kernel.cu b/paddle/phi/kernels/shape_kernel.cc similarity index 53% rename from paddle/phi/kernels/gpu/shape_kernel.cu rename to paddle/phi/kernels/shape_kernel.cc index 39b6eaeaef2a8e80d204941dc1f3ac92907aa786..dd26a7edc9cdd8e1917bb5d88e957b3e7d545f93 100644 --- a/paddle/phi/kernels/gpu/shape_kernel.cu +++ b/paddle/phi/kernels/shape_kernel.cc @@ -13,12 +13,43 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/shape_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/shape_kernel_impl.h" +namespace phi { + +template +void ShapeKernel(const Context& ctx, + const DenseTensor& input, + DenseTensor* out) { + auto in_var = &input; + phi::DDim in_dims; + in_dims = in_var->dims(); + auto out_t = out; + out_t->Resize({in_dims.size()}); + auto out_data = ctx.template HostAlloc(out_t); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(shape, + CPU, + ALL_LAYOUT, + phi::ShapeKernel, + bool, + int, + int8_t, + uint8_t, + int64_t, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(shape, GPU, ALL_LAYOUT, @@ -33,3 +64,4 @@ PD_REGISTER_KERNEL(shape, phi::dtype::complex, phi::dtype::complex, phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index 42bde442e1e063a355d2eabb2963865a2ff45bcb..23e059c72e77615e2c24aed961d22b3154c30449 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -27,7 +27,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -41,7 +41,7 @@ std::vector Conv3dGrad(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 64c32df18971c4d66873b02a61220a5bed8db005..93a335e2f1c35700d2bf5ef54400c52ed54f6be2 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -34,7 +34,7 @@ using Dims4D = phi::funcs::sparse::Dims4D; template void ProductRuleBook(const Context& dev_ctx, const SparseCooTensor& x, - const DenseTensor& kernel, + const std::vector& kernel_sizes, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -42,19 +42,19 @@ void ProductRuleBook(const Context& dev_ctx, const bool subm, DenseTensor* rulebook, DenseTensor* counter_per_kernel) { - const auto& kernel_dims = kernel.dims(); const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const int* indices_ptr = non_zero_indices.data(); int* counter_ptr = counter_per_kernel->data(); - int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; memset(counter_ptr, 0, kernel_size * sizeof(int)); int rulebook_len = 0; // calc the rulebook_len const auto& x_dims = x.dims(); const Dims4D c_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - const Dims4D c_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); + const Dims4D c_kernel_dims( + 1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]); const Dims4D c_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); const Dims4D c_paddings(1, paddings[2], paddings[1], paddings[0]); const Dims4D c_strides(1, strides[2], strides[1], strides[0]); @@ -75,9 +75,9 @@ void ProductRuleBook(const Context& dev_ctx, auto f_calc_rulebook = [&](int* rulebook_ptr) { int kernel_index = 0, rulebook_index = 0; - for (int kz = 0; kz < kernel_dims[0]; kz++) { - for (int ky = 0; ky < kernel_dims[1]; ky++) { - for (int kx = 0; kx < kernel_dims[2]; kx++) { + for (int kz = 0; kz < kernel_sizes[0]; kz++) { + for (int ky = 0; ky < kernel_sizes[1]; ky++) { + for (int kx = 0; kx < kernel_sizes[2]; kx++) { ++kernel_index; for (int64_t i = 0; i < non_zero_num; i++) { int batch = indices_ptr[i]; diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 5d7b381b7cb0beef7e69608ce7f732d8cdf9d222..3348d81cf6b4bbffe7f6db24dbe12fef24cadf40 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -33,7 +33,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -113,7 +113,7 @@ void Conv3dGradKernel(const Context& dev_ctx, rulebook_len, in_channels, in_features_ptr); - Gather(out_grad.non_zero_elements().data(), + Gather(out_grad.data(), rulebook_ptr + rulebook_len * 2, rulebook_len, out_channels, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 746ca04a826c020201e53803dd2ec83519cf576e..f022e4ef4bb63617018d6e6ecdf2560b72dead3a 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -44,8 +44,13 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } + phi::funcs::sparse::GetOutShape( - x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; @@ -63,7 +68,7 @@ void Conv3dKernel(const Context& dev_ctx, ProductRuleBook(dev_ctx, x, - kernel, + kernel_sizes, subm_paddings, dilations, subm_strides, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 8826fd7cf87e0a7a4a8251b4da823f18190f4a38..5b928817f64d748ec824a2c28e569181034d1072 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -23,11 +23,15 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + // TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace // this kernel with phi::GatherCUDAKernel; // Vectorization can be used to improve read and write bandwidth @@ -139,5 +143,494 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx, return new_end.first; } +template +__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, + const int n, + const int rulebook_len, + const int kernel_size, + T* rulebook_ptr, + int* counter_ptr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int cache_count[]; // kernel_size + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + cache_count[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < n; i += gridDim.x * blockDim.x) { + int index = indexs[i]; + int kernel_index = rulebook_ptr[index]; + rulebook_ptr[index + rulebook_len] = -1; + rulebook_ptr[index + 2 * rulebook_len] = -1; + rulebook_ptr[index] = -1; + atomicAdd(&cache_count[kernel_index], 1); + } + __syncthreads(); + + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicSub(&counter_ptr[i], cache_count[i]); + } +} + +/** + * @brief: update the out index and indices + * unique_keys: save the index of the output feature list + * unique_values: indiates the index of key before deduplication + * out_indexs: indicates the position of the output index in the rulebook + * rulebook_len: indicates the length of rulebook + * out_dims: indicates the output dims + * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) + * rulebook_out_indexs: the output index in rulebook +**/ +template +__global__ void UpdateIndexKernel(const int* unique_keys, + const int* unique_values, + const int* out_indexs, + const int non_zero_num, + const int rulebook_len, + const Dims4D out_dims, + T* out_indices, + T* rulebook_out_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + const int index = unique_keys[i]; + int batch, x, y, z; + phi::funcs::sparse::IndexToPoint( + index, out_dims, &batch, &x, &y, &z); + // get out indices + out_indices[i] = batch; + out_indices[i + non_zero_num] = z; + out_indices[i + non_zero_num * 2] = y; + out_indices[i + non_zero_num * 3] = x; + + // update rulebook + int start = unique_values[i]; + int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1]; + // max(end-start) = kernel_size + for (int j = start; j < end; j++) { + rulebook_out_indexs[out_indexs[j]] = i; + } + } +} + +// brief: calculation the distance between start and end +template +__global__ void DistanceKernel(const T* start, const T* end, int* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +/** + * @brief product rulebook + * for input_i in x_indices: + * if input_i participate in the convolution calculation: + * infer the output_i by input_i and kernel_i + * save output_i + * + * x_indices: the indices of input features + * x_dims: the input dims + * kernel_dims: the kernel dims + * out_dims: the output dims + * non_zero_num: the number of input features + * rulebook: the rulebook to save the kernel index, input index and output index + * counter: save the number of times each location in the kernel participates in + *the caculation +**/ +template +__global__ void ProductRuleBookKernel(const T* x_indices, + const Dims4D x_dims, + const Dims4D kernel_dims, + const Dims4D out_dims, + const int64_t non_zero_num, + const Dims4D paddings, + const Dims4D dilations, + const Dims4D strides, + const bool subm, + T* rulebook, + int* counter, + int* in_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int counter_buf[]; // kernel_size + const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; + const int offset = kernel_size * non_zero_num; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int kernel_index = 0; + int batch = x_indices[i]; + int in_z = x_indices[i + non_zero_num]; + int in_y = x_indices[i + 2 * non_zero_num]; + int in_x = x_indices[i + 3 * non_zero_num]; + if (subm) { + in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); + } + for (int kz = 0; kz < kernel_dims[1]; kz++) { + for (int ky = 0; ky < kernel_dims[2]; ky++) { + for (int kx = 0; kx < kernel_dims[3]; kx++) { + int in_i = -1, out_index = -1, kernel_i = -1; + if (phi::funcs::sparse::Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; + int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; + int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; + in_i = i; + out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); + atomicAdd(&counter_buf[kernel_index], 1); + kernel_i = kernel_index; + } + rulebook[kernel_index * non_zero_num + i] = kernel_i; + rulebook[kernel_index * non_zero_num + offset + i] = in_i; + rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; + ++kernel_index; + } + } + } + } + __syncthreads(); + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicAdd(&counter[i], counter_buf[i]); + } +} + +// the basic algorithm can refer to convolution_kernel.cc or +// the second paper +// example: +// 1. the rulebook: +// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... +// the out_index(key): 20, 30, 33, 30, 33, 20, 25 +// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... +// 3. sorted the (key, value) +// 4. unique the (key, value): +// unique_key: 20, 25, 30, 33 +// unique_values: 0, 2, 3, 5 +// the index of unique_values is: 0, 1, 2, 3 +// 5. update the out_index by unique_key, uniqe_value and the index of +// unique_value: +// the new out_index: 0, 2, 3, 2, 3, 0, 1 +template +int ProductRuleBook(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const DDim& out_dims, + const bool subm, + DenseTensor* rulebook, + DenseTensor* counter_per_kernel, + DenseTensor* offsets_per_kernel, + DenseTensor* out_index, + DenseTensor* unique_key, + DenseTensor* unique_value, + SparseCooTensor* out, + std::vector* h_counter, + std::vector* h_offsets) { + const int64_t non_zero_num = x.nnz(); + const auto& non_zero_indices = x.non_zero_indices(); + const int* indices_ptr = non_zero_indices.data(); + DenseTensor in_indexs = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + int* counter_ptr = counter_per_kernel->data(); + int* offsets_ptr = offsets_per_kernel->data(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); + int* rulebook_ptr = rulebook->data(); + + const auto x_dims = x.dims(); + Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); + Dims4D d_kernel_dims(1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]); + Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); + Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); + Dims4D d_strides(1, strides[2], strides[1], strides[0]); + Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); + + // 1. product rule book + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, counter_per_kernel, 0); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + + ProductRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + subm, + rulebook_ptr, + counter_ptr, + in_indexs.data()); + +// 2. remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + rulebook_rows * rulebook_cols, + -1); + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + int rulebook_len = 0; + phi::backends::gpu::GpuMemcpyAsync( + &rulebook_len, + rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + rulebook_len /= 3; + dev_ctx.Wait(); + + if (subm) { + // At present, hashtable is not used to map the input and output indexes. + // At present, the intermediate output index is generated by normal + // convolution, + // and then the intermediate output index is subtracted from the input index + // to obain the rulebook. + // get difference + int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; + int32_t* B_key_ptr = in_indexs.data(); + DenseTensor A_val = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + DenseTensor B_val = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + phi::IndexKernel>( + dev_ctx, &A_val, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, &B_val, kps::IdentityFunctor()); + DenseTensor key_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); + DenseTensor val_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + std::vector offsets(kernel_size, 0); + // TODO(zhangkaihuo): used unified memcpy interface + phi::backends::gpu::GpuMemcpyAsync(offsets.data(), + offsets_ptr, + kernel_size * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + + thrust::pair end; + // Because set_diff does not support duplicate data, set_diff is performed + // separately for each segment of data. + // TODO(zhangkaihuo): Using hashtable here may get better performance, + // further tests ared needed. + for (int i = 0; i < kernel_size; i++) { + int start = offsets[i]; + int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; + int* key_result_start = (i == 0 ? key_result.data() : end.first); + int* val_result_start = i == 0 ? val_result.data() : end.second; + end = +#ifdef PADDLE_WITH_HIP + thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + A_key_ptr + start, + A_key_ptr + stop, + B_key_ptr, + B_key_ptr + x.nnz(), + A_val.data() + start, + B_val.data(), + key_result_start, + val_result_start); + } + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + key_result.data(), + end.first, + key_result.data() + rulebook_len); + int len = 0; + phi::backends::gpu::GpuMemcpyAsync(&len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + // set the diff value = -1, and update counter + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); + SetFlagAndUpdateCounterKernel<<>>( + val_result.data(), + len, + rulebook_len, + kernel_size, + rulebook_ptr, + counter_ptr); +// remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + 3 * rulebook_len, + -1); + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, key_result.data() + rulebook_len); + phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + rulebook_len /= 3; + } + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + rulebook->Resize({rulebook_rows, rulebook_len}); + + // 3. sorted or merge the out index + out_index->ResizeAndAllocate({rulebook_len}); + unique_value->ResizeAndAllocate({rulebook_len}); + unique_key->ResizeAndAllocate({rulebook_len}); + int* out_index_ptr = out_index->data(); + int* unique_value_ptr = unique_value->data(); + int* unique_key_ptr = unique_key->data(); + + int* new_end = SortedAndUniqueIndex(dev_ctx, + rulebook_ptr + 2 * rulebook_len, + rulebook_len, + out_index, + unique_key, + unique_value); + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + DistanceKernel<<<1, 1>>>( + unique_key_ptr, + new_end, + rulebook_ptr + rulebook_rows * rulebook_cols - 1); + int out_non_zero_num = 0; +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + dev_ctx.Wait(); + + // 5. update out_indices and rulebook by unique_value_ptr + const int64_t sparse_dim = 4; + DenseTensorMeta indices_meta( + DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_non_zero_num, kernel_sizes[4]}, x.layout()); + phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + + int* out_indices_ptr = out_indices.data(); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + UpdateIndexKernel<<>>(unique_key_ptr, + unique_value_ptr, + out_index_ptr, + out_non_zero_num, + rulebook_len, + d_out_dims, + out_indices_ptr, + rulebook_ptr + 2 * rulebook_len); + out->SetMember(out_indices, out_values, out_dims, true); + return rulebook_len; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d6d992d0f4b651b1a9a47cdddcae116a215a0e57..4db0a0b0011b5a664b66d54f6d42f2e1954ccd12 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -38,7 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -140,12 +140,11 @@ void Conv3dGradKernel(const Context& dev_ctx, GatherKernel<<>>( - out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + dev_ctx.stream()>>>(out_grad.data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 1a0c7e9b972145fbc98cdb9dfee0267a9eaa9f90..214e689e9370a313e66be0281db177407d7b87f0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -12,515 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include - -#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/index_impl.cu.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" namespace phi { namespace sparse { -using Dims4D = phi::funcs::sparse::Dims4D; - -__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, - const int n, - const int rulebook_len, - const int kernel_size, - int* rulebook_ptr, - int* counter_ptr) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int cache_count[]; // kernel_size - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - cache_count[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < n; i += gridDim.x * blockDim.x) { - int index = indexs[i]; - int kernel_index = rulebook_ptr[index]; - rulebook_ptr[index + rulebook_len] = -1; - rulebook_ptr[index + 2 * rulebook_len] = -1; - rulebook_ptr[index] = -1; - atomicAdd(&cache_count[kernel_index], 1); - } - __syncthreads(); - - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicSub(&counter_ptr[i], cache_count[i]); - } -} - -/** - * @brief: update the out index and indices - * unique_keys: save the index of the output feature list - * unique_values: indiates the index of key before deduplication - * out_indexs: indicates the position of the output index in the rulebook - * rulebook_len: indicates the length of rulebook - * out_dims: indicates the output dims - * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) - * rulebook_out_indexs: the output index in rulebook -**/ -__global__ void UpdateIndexKernel(const int* unique_keys, - const int* unique_values, - const int* out_indexs, - const int non_zero_num, - const int rulebook_len, - const Dims4D out_dims, - int* out_indices, - int* rulebook_out_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - const int index = unique_keys[i]; - int batch, x, y, z; - phi::funcs::sparse::IndexToPoint( - index, out_dims, &batch, &x, &y, &z); - // get out indices - out_indices[i] = batch; - out_indices[i + non_zero_num] = z; - out_indices[i + non_zero_num * 2] = y; - out_indices[i + non_zero_num * 3] = x; - - // update rulebook - int start = unique_values[i]; - int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1]; - // max(end-start) = kernel_size - for (int j = start; j < end; j++) { - rulebook_out_indexs[out_indexs[j]] = i; - } - } -} - -/** - * @brief product rulebook - * for input_i in x_indices: - * if input_i participate in the convolution calculation: - * infer the output_i by input_i and kernel_i - * save output_i - * - * x_indices: the indices of input features - * x_dims: the input dims - * kernel_dims: the kernel dims - * out_dims: the output dims - * non_zero_num: the number of input features - * rulebook: the rulebook to save the kernel index, input index and output index - * counter: save the number of times each location in the kernel participates in - *the caculation -**/ -__global__ void ProductRuleBookKernel(const int* x_indices, - const Dims4D x_dims, - const Dims4D kernel_dims, - const Dims4D out_dims, - const int64_t non_zero_num, - const Dims4D paddings, - const Dims4D dilations, - const Dims4D strides, - const bool subm, - int* rulebook, - int* counter, - int* in_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int counter_buf[]; // kernel_size - const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; - const int offset = kernel_size * non_zero_num; - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - counter_buf[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - int kernel_index = 0; - int batch = x_indices[i]; - int in_z = x_indices[i + non_zero_num]; - int in_y = x_indices[i + 2 * non_zero_num]; - int in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } - for (int kz = 0; kz < kernel_dims[1]; kz++) { - for (int ky = 0; ky < kernel_dims[2]; ky++) { - for (int kx = 0; kx < kernel_dims[3]; kx++) { - int in_i = -1, out_index = -1, kernel_i = -1; - if (phi::funcs::sparse::Check(x_dims, - kernel_dims, - paddings, - dilations, - strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { - int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; - int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; - int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; - in_i = i; - out_index = phi::funcs::sparse::PointToIndex( - batch, out_x, out_y, out_z, out_dims); - atomicAdd(&counter_buf[kernel_index], 1); - kernel_i = kernel_index; - } - rulebook[kernel_index * non_zero_num + i] = kernel_i; - rulebook[kernel_index * non_zero_num + offset + i] = in_i; - rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; - ++kernel_index; - } - } - } - } - __syncthreads(); - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicAdd(&counter[i], counter_buf[i]); - } -} - -// brief: calculation the distance between start and end -__global__ void DistanceKernel(const int* start, - const int* end, - int* distance) { - if (threadIdx.x == 0) { - *distance = end - start; - } -} - -// the basic algorithm can refer to convolution_kernel.cc or -// the second paper -// example: -// 1. the rulebook: -// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... -// the out_index(key): 20, 30, 33, 30, 33, 20, 25 -// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... -// 3. sorted the (key, value) -// 4. unique the (key, value): -// unique_key: 20, 25, 30, 33 -// unique_values: 0, 2, 3, 5 -// the index of unique_values is: 0, 1, 2, 3 -// 5. update the out_index by unique_key, uniqe_value and the index of -// unique_value: -// the new out_index: 0, 2, 3, 2, 3, 0, 1 -template -int ProductRuleBook(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const DDim& out_dims, - const bool subm, - DenseTensor* rulebook, - DenseTensor* counter_per_kernel, - DenseTensor* offsets_per_kernel, - DenseTensor* out_index, - DenseTensor* unique_key, - DenseTensor* unique_value, - SparseCooTensor* out, - std::vector* h_counter, - std::vector* h_offsets) { - const auto& kernel_dims = kernel.dims(); - const int64_t non_zero_num = x.nnz(); - const auto& non_zero_indices = x.non_zero_indices(); - const int* indices_ptr = non_zero_indices.data(); - DenseTensor in_indexs = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); - int* counter_ptr = counter_per_kernel->data(); - int* offsets_ptr = offsets_per_kernel->data(); - int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; - const int rulebook_rows = 3; - const int rulebook_cols = kernel_size * non_zero_num; - rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); - int* rulebook_ptr = rulebook->data(); - - const auto x_dims = x.dims(); - Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); - Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); - Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); - Dims4D d_strides(1, strides[2], strides[1], strides[0]); - Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); - - // 1. product rule book - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, counter_per_kernel, 0); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - - ProductRuleBookKernel<<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - rulebook_ptr, - counter_ptr, - in_indexs.data()); - -// 2. remove -1 -#ifdef PADDLE_WITH_HIP - int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + rulebook_rows * rulebook_cols, - -1); - - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); - int rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( - &rulebook_len, - rulebook_ptr + 3 * kernel_size * non_zero_num - 1, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - rulebook_len /= 3; - dev_ctx.Wait(); - - if (subm) { - // At present, hashtable is not used to map the input and output indexes. - // At present, the intermediate output index is generated by normal - // convolution, - // and then the intermediate output index is subtracted from the input index - // to obain the rulebook. - // get difference - int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; - int32_t* B_key_ptr = in_indexs.data(); - DenseTensor A_val = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - DenseTensor B_val = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); - phi::IndexKernel>( - dev_ctx, &A_val, kps::IdentityFunctor()); - phi::IndexKernel>( - dev_ctx, &B_val, kps::IdentityFunctor()); - DenseTensor key_result = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); - DenseTensor val_result = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - std::vector offsets(kernel_size, 0); - // TODO(zhangkaihuo): used unified memcpy interface - phi::backends::gpu::GpuMemcpyAsync(offsets.data(), - offsets_ptr, - kernel_size * sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - - thrust::pair end; - // Because set_diff does not support duplicate data, set_diff is performed - // separately for each segment of data. - // TODO(zhangkaihuo): Using hashtable here may get better performance, - // further tests ared needed. - for (int i = 0; i < kernel_size; i++) { - int start = offsets[i]; - int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; - int* key_result_start = (i == 0 ? key_result.data() : end.first); - int* val_result_start = i == 0 ? val_result.data() : end.second; - end = -#ifdef PADDLE_WITH_HIP - thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - A_key_ptr + start, - A_key_ptr + stop, - B_key_ptr, - B_key_ptr + x.nnz(), - A_val.data() + start, - B_val.data(), - key_result_start, - val_result_start); - } - - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - key_result.data(), - end.first, - key_result.data() + rulebook_len); - int len = 0; - phi::backends::gpu::GpuMemcpyAsync(&len, - key_result.data() + rulebook_len, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - // set the diff value = -1, and update counter - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); - SetFlagAndUpdateCounterKernel<<>>(val_result.data(), - len, - rulebook_len, - kernel_size, - rulebook_ptr, - counter_ptr); -// remove -1 -#ifdef PADDLE_WITH_HIP - int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + 3 * rulebook_len, - -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, key_result.data() + rulebook_len); - phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, - key_result.data() + rulebook_len, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - rulebook_len /= 3; - } - -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif - rulebook->Resize({rulebook_rows, rulebook_len}); - - // 3. sorted or merge the out index - out_index->ResizeAndAllocate({rulebook_len}); - unique_value->ResizeAndAllocate({rulebook_len}); - unique_key->ResizeAndAllocate({rulebook_len}); - int* out_index_ptr = out_index->data(); - int* unique_value_ptr = unique_value->data(); - int* unique_key_ptr = unique_key->data(); - - int* new_end = SortedAndUniqueIndex(dev_ctx, - rulebook_ptr + 2 * rulebook_len, - rulebook_len, - out_index, - unique_key, - unique_value); - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - DistanceKernel<<<1, 1>>>(unique_key_ptr, - new_end, - rulebook_ptr + rulebook_rows * rulebook_cols - 1); - int out_non_zero_num = 0; -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif - dev_ctx.Wait(); - - // 5. update out_indices and rulebook by unique_value_ptr - const int64_t sparse_dim = 4; - DenseTensorMeta indices_meta( - DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout()); - phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); - phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - - int* out_indices_ptr = out_indices.data(); - - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); - UpdateIndexKernel<<>>(unique_key_ptr, - unique_value_ptr, - out_index_ptr, - out_non_zero_num, - rulebook_len, - d_out_dims, - out_indices_ptr, - rulebook_ptr + 2 * rulebook_len); - out->SetMember(out_indices, out_values, out_dims, true); - return rulebook_len; -} - /** * x: (N, D, H, W, C) * kernel: (D, H, W, C, OC) @@ -545,9 +46,12 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } phi::funcs::sparse::GetOutShape( - x_dims, kernel_dims, paddings, dilations, strides, &out_dims); - out->set_dims(out_dims); + x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; std::vector offsets(kernel_size + 1), h_counter(kernel_size); @@ -574,7 +78,7 @@ void Conv3dKernel(const Context& dev_ctx, int n = ProductRuleBook(dev_ctx, x, - kernel, + kernel_sizes, subm_paddings, dilations, subm_strides, diff --git a/paddle/phi/kernels/tril_triu_grad_kernel.h b/paddle/phi/kernels/tril_triu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..10faf5c48d5bffab9f5199ebeefe7d5a2267ecea --- /dev/null +++ b/paddle/phi/kernels/tril_triu_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/tril_triu_kernel.h b/paddle/phi/kernels/tril_triu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4daa84e25c373d6bd5a26f2682385921dc2ce880 --- /dev/null +++ b/paddle/phi/kernels/tril_triu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index cbfca5b17ae995a89360c6d6d4987028d95dc281..890dbadf17c81fa40f629114df47f518fdcc387b 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -16,45 +16,49 @@ limitations under the License. */ namespace phi { -#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature(op_name "_grad", \ - {"X", GradVarName("Out")}, \ - {attrs}, \ - {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"X", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } -#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature(op_name "_grad", \ - {"Out", GradVarName("Out")}, \ - {attrs}, \ - {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"Out", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } #define comma , -DefineActGradDepXOpArgMap(Cos, "cos", ); // NOLINT -DefineActGradDepXOpArgMap(Tan, "tan", ); // NOLINT -DefineActGradDepXOpArgMap(Acos, "acos", ); // NOLINT -DefineActGradDepXOpArgMap(Sin, "sin", ); // NOLINT -DefineActGradDepXOpArgMap(Asin, "asin", ); // NOLINT -DefineActGradDepXOpArgMap(Atan, "atan", ); // NOLINT -DefineActGradDepXOpArgMap(Sinh, "sinh", ); // NOLINT -DefineActGradDepXOpArgMap(Cosh, "cosh", ); // NOLINT -DefineActGradDepXOpArgMap(Asinh, "asinh", ); // NOLINT -DefineActGradDepXOpArgMap(Acosh, "acosh", ); // NOLINT -DefineActGradDepXOpArgMap(Atanh, "atanh", ); // NOLINT -DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max"); // NOLINT -DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha"); // NOLINT -DefineActGradDepXOpArgMap(ThresholdedRelu, - "thresholded_relu", - "threshold"); // NOLINT - -DefineActGradDepOutOpArgMap(Relu, "relu", ); // NOLINT -DefineActGradDepOutOpArgMap(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu, + "thresholded_relu", + "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT + +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { @@ -85,11 +89,31 @@ KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); } +KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"}); +} + +KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("elu_grad", + {"X", "Out", GradVarName("Out")}, + {"alpha"}, + {GradVarName("X")}); +} + +KernelSignature EluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); +PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); +PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -118,3 +142,13 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, phi::LeakyReluDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, phi::ThresholdedReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softshrink_grad, + phi::SoftShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad, + phi::HardShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad, + phi::TanhShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/determinant_sig.cc b/paddle/phi/ops/compat/determinant_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..7bcd30ec5d79b9e137c3dc3fa38f0498e9fe01de --- /dev/null +++ b/paddle/phi/ops/compat/determinant_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeterminantGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("determinant_grad", + {"Input", "Out", GradVarName("Out")}, + {}, + {GradVarName("Input")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(determinant_grad, + phi::DeterminantGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gather_sig.cc b/paddle/phi/ops/compat/gather_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c47bbe48b8ee18527cfef41fad3488bef6c1dd9 --- /dev/null +++ b/paddle/phi/ops/compat/gather_sig.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GatherOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Axis")) { + return KernelSignature("gather", {"X", "Index"}, {"Axis"}, {"Out"}); + } else { + return KernelSignature("gather", {"X", "Index"}, {"axis"}, {"Out"}); + } +} + +KernelSignature GatherGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Axis")) { + return KernelSignature("gather_grad", + {"X", "Index", GradVarName("Out")}, + {"Axis", "overwrite"}, + {GradVarName("X")}); + } else { + return KernelSignature("gather_grad", + {"X", "Index", GradVarName("Out")}, + {"axis", "overwrite"}, + {GradVarName("X")}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gather, phi::GatherOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather_grad, phi::GatherGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/grid_sampler_sig.cc b/paddle/phi/ops/compat/grid_sampler_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..b76a9770d4dede5ea604f69858201c2fb035070d --- /dev/null +++ b/paddle/phi/ops/compat/grid_sampler_sig.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GridSamplerOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grid_sample", + {"X", "Grid"}, + {"mode", "padding_mode", "align_corners"}, + {"Output"}); +} + +KernelSignature GridSamplerGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grid_sample_grad", + {"X", "Grid", GradVarName("Output")}, + {"mode", "padding_mode", "align_corners"}, + {GradVarName("X"), GradVarName("Grid")}); +} + +} // namespace phi + +// use Python API name as kernel name +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample); +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad); + +PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad, + phi::GridSamplerGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/index_select_sig.cc b/paddle/phi/ops/compat/index_select_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..53eff1bbcd7ed5269299ccfe41631a699e3d0a32 --- /dev/null +++ b/paddle/phi/ops/compat/index_select_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IndexSelectGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("index_select_grad", + {"X", "Index", GradVarName("Out")}, + {"dim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(index_select_grad, + phi::IndexSelectGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..20994c08aa73c33328568e334d258c44eef68171 --- /dev/null +++ b/paddle/phi/ops/compat/mode_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"}); +} + +KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mode_grad", + {"X", "Indices", GradVarName("Out")}, + {"axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/multiplex_sig.cc b/paddle/phi/ops/compat/multiplex_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dab4655d172312a7389d0bb243e31ee39ef5981 --- /dev/null +++ b/paddle/phi/ops/compat/multiplex_sig.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"}); +} + +KernelSignature MultiplexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "multiplex_grad", {"Ids", GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping); diff --git a/paddle/phi/kernels/impl/shape_kernel_impl.h b/paddle/phi/ops/compat/qr_sig.cc similarity index 57% rename from paddle/phi/kernels/impl/shape_kernel_impl.h rename to paddle/phi/ops/compat/qr_sig.cc index 982cfb33f6b14fc14c7c58ff8c4548a4cdbd3b3b..dd424d590ee113adfab0e9643c3c7ffc519f86e4 100644 --- a/paddle/phi/kernels/impl/shape_kernel_impl.h +++ b/paddle/phi/ops/compat/qr_sig.cc @@ -12,25 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - -#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/compat/op_utils.h" namespace phi { -template -void ShapeKernel(const Context& ctx, - const DenseTensor& input, - DenseTensor* out) { - auto in_var = &input; - phi::DDim in_dims; - in_dims = in_var->dims(); - auto out_t = out; - out_t->Resize({in_dims.size()}); - auto out_data = ctx.template HostAlloc(out_t); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } +KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"}); } } // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index dcb00fe1b0cceb978ad24eda10ef78e339642d75..789496ccbd01c12504e1aeb9f89b60bf94a091c9 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -52,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "max_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "max_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "prod_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("prod", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); } KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { @@ -107,10 +118,6 @@ KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("X")) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); - // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in - // InferShape, so we must return the "all_raw" KernelSignature. - // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with - // the "all_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); @@ -135,6 +142,7 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min); +PD_REGISTER_BASE_KERNEL_NAME(reduce_prod, prod); PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); diff --git a/paddle/phi/ops/compat/roi_align_sig.cc b/paddle/phi/ops/compat/roi_align_sig.cc index 0549103b6fbcb8b2367c34c8a44fb3b52f318859..1717ec8f788091fc5eae59c40a32a30c355760e8 100644 --- a/paddle/phi/ops/compat/roi_align_sig.cc +++ b/paddle/phi/ops/compat/roi_align_sig.cc @@ -16,7 +16,7 @@ namespace phi { -KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { +KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("roi_align", {"X", "ROIs", "RoisNum"}, {"pooled_height", @@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { {"Out"}); } +KernelSignature RoiAlignGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("roi_align_grad", + {"X", "ROIs", "RoisNum", GradVarName("Out")}, + {"pooled_height", + "pooled_width", + "spatial_scale", + "sampling_ratio", + "aligned"}, + {GradVarName("X")}); +} + } // namespace phi -PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/roll_sig.cc b/paddle/phi/ops/compat/roll_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a144f0e8e8a90eee0bf0a8a80455b1e19611880c --- /dev/null +++ b/paddle/phi/ops/compat/roll_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("ShiftsTensor")) { + return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"}); + } + return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"}); +} + +KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roll_grad", + {"X", GradVarName("Out")}, + {"shifts", "axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/tile_sig.cc b/paddle/phi/ops/compat/tile_sig.cc index 49a6d02225d931f1dc2d3324cb13c2c620f5dfe6..ca3fa5fe1f86ac13252c04c05c0508c47feded42 100644 --- a/paddle/phi/ops/compat/tile_sig.cc +++ b/paddle/phi/ops/compat/tile_sig.cc @@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("RepeatTimes")) { return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"}); } else if (ctx.InputSize("repeat_times_tensor") > 0) { + const auto& repeat_times = + paddle::any_cast>(ctx.Attr("repeat_times")); + if (!ctx.IsRuntime() && !repeat_times.empty()) { + return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); + } return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"}); } else { return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); diff --git a/paddle/phi/ops/compat/tril_triu_sig.cc b/paddle/phi/ops/compat/tril_triu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f79f8650decfc6556287be2caefa6d1074ecf7f --- /dev/null +++ b/paddle/phi/ops/compat/tril_triu_sig.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"}); +} + +KernelSignature TrilTriuGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("tril_triu_grad", + {GradVarName("Out")}, + {"diagonal", "lower"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping); diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc index 23edfeacaf81436d6381be674c72a27ae96e0b41..ce31b2021e01a4130038e7e26bc37fd3e13ef27a 100644 --- a/paddle/phi/tests/kernels/test_mean_dev_api.cc +++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 37a69a176c6e1ded81a8449da3c571442bd94e78..4800e1402ba56f2956c207f44f2656a71d50b92c 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -132,16 +132,17 @@ void TestConv3dBase(const std::vector& indices, f_verify(out.non_zero_elements().data(), correct_out_features); if (backward) { - std::vector grads = sparse::Conv3dGrad(dev_ctx_cpu, - x_tensor, - rulebook, - kernel_tensor, - out, - paddings, - dilations, - strides, - 1, - subm); + std::vector grads = + sparse::Conv3dGrad(dev_ctx_cpu, + x_tensor, + rulebook, + kernel_tensor, + out.non_zero_elements(), + paddings, + dilations, + strides, + 1, + subm); f_verify(grads[0].data(), features_grad); f_verify(grads[1].data(), kernel_grad); } @@ -231,16 +232,17 @@ void TestConv3dBase(const std::vector& indices, f_verify(h_features_tensor.data(), correct_out_features); if (backward) { - std::vector grads = sparse::Conv3dGrad(dev_ctx_gpu, - d_x_tensor, - d_rulebook, - d_kernel_tensor, - d_out, - paddings, - dilations, - strides, - 1, - subm); + std::vector grads = + sparse::Conv3dGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_kernel_tensor, + d_out.non_zero_elements(), + paddings, + dilations, + strides, + 1, + subm); DenseTensor h_features_grad = phi::Empty( dev_ctx_cpu, DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout())); diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc index dfec291bc072f023dd09dba768cdeeb6e4cc3a34..82fa90c1574bd5c358d9e2325349811d43f5d973 100644 --- a/paddle/phi/tests/kernels/test_sum_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 76b45ff89f1869cc5e401b5c1b4151ad14158259..850d4015abf7a8164add9d4896d5a9bdfa26989d 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -44,6 +44,8 @@ function update_pd_ops() { cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py python3 generate_phi_kernel_dialect.py + # generate test model + python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs } function init() { @@ -93,7 +95,7 @@ function infrt_gen_and_build() { exit 7; fi - make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-ir-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? + make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? if [ "$build_error" != 0 ];then exit 7; fi diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index ae2d9163435b906f17e9b28a680302d2bd305bbc..e303ce1216822b26bb58813c37239ae3e3fec043 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -21,11 +21,12 @@ from paddle.fluid import framework from .utils import print_program_with_dist_attr from .operators import find_best_compatible_distributed_operator_impl -from .dist_context import get_default_distributed_context +from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor from .dist_op import DistributedOperator from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute +from .process_mesh import ProcessMesh from paddle.distributed.fleet.meta_optimizers.common import OpRole @@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list): return compatible_result +def merge_process_mesh_two(pm1, pm2): + process_set1 = set() + process_set2 = set() + if pm1 is None and pm2 is None: + return None + if pm1 is not None: + process_set1 = set(pm1.processes) + if pm2 is not None: + process_set2 = set(pm2.processes) + merged_process_set = process_set1.union(process_set2) + merged_process_mesh = ProcessMesh(list(merged_process_set)) + return merged_process_mesh + + class Completer: def __init__(self, dist_context): assert dist_context is not None @@ -119,7 +134,9 @@ class Completer: return False tensor_desc = tensor_node.var() # Skip reader tensor - if tensor_desc.type() == core.VarDesc.VarType.READER: + if tensor_desc.type() == core.VarDesc.VarType.READER \ + or tensor_desc.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor_desc.type == core.VarDesc.VarType.STEP_SCOPES: return False tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( tensor_node) @@ -185,7 +202,7 @@ class Completer: op_dist_attr = dist_op.dist_attr if fwd: for tensor_node in op_node.inputs: - if tensor_node.var() is not None: + if tensor_node.is_var() and tensor_node.var() is not None: if tensor_node.var().type() == core.VarDesc.VarType.READER: continue tensor_desc = tensor_node.var() @@ -208,19 +225,19 @@ class Completer: # Find the most compatible implemenetations from the distributed operator op_dist_impl = find_best_compatible_distributed_operator_impl( dist_op, fwd=True) - assert op_dist_impl is not None, "Cannot find the dist op implementation." - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + if op_dist_impl is not None: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx else: for tensor_node in op_node.outputs: - if tensor_node.var() is not None: + if tensor_node.is_var() and tensor_node.var() is not None: if tensor_node.var().type() == core.VarDesc.VarType.READER: continue tensor_desc = tensor_node.var() @@ -243,61 +260,38 @@ class Completer: # Find the most compatible implemenetations from the distributed operator op_dist_impl = find_best_compatible_distributed_operator_impl( dist_op, fwd=False) - assert op_dist_impl is not None, "Cannot find the dist op implementation." - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + if op_dist_impl is not None: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx return changed - def _update_process_mesh(self): - def _find_nearset_node(nodes, idx): - for node in reversed(nodes[:idx]): - node_dist_attr = self._dist_context.get_dist_attr_for_graph( - node) - if node_dist_attr.process_mesh is not None: - return node - - total_reach_fix_point = False - while not total_reach_fix_point: - total_changed = False - for is_fwd in [True, False]: - all_nodes = self._dist_context.serial_ordered_nodes \ - if is_fwd else reversed(self._dist_context.serial_ordered_nodes) - reach_fix_point = False - while not reach_fix_point: - changed = False - for idx, node in enumerate(all_nodes): - nearest_node = _find_nearset_node( - self._dist_context.serial_ordered_nodes, idx) - if nearest_node is None: - continue - nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph( - nearest_node) - nearest_process_mesh = nearest_node_dis_attr.process_mesh - cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph( - node) - cur_process_mesh = cur_node_dist_attr.process_mesh - compatible_process_mesh = compute_compatible_process_mesh( - [cur_process_mesh, nearest_process_mesh]) - if compatible_process_mesh is not None \ - and cur_process_mesh != compatible_process_mesh: - cur_node_dist_attr.process_mesh = compatible_process_mesh - changed = True - if changed: - reach_fix_point = False - total_changed = True - else: - reach_fix_point = True - if total_changed: - total_reach_fix_point = False - else: - total_reach_fix_point = True + def _update_dims_mapping_between_graphs(self): + changed = False + for parent_node, child_node in self._node_pairs_between_graphs: + parent_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + parent_node) + child_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + child_node) + parent_node_dims_mapping = parent_node_dist_attr.dims_mapping + child_node_dims_mapping = child_node_dist_attr.dims_mapping + compatible_dims_mapping = compute_compatible_dims_mapping( + [parent_node_dims_mapping, child_node_dims_mapping]) + if (compatible_dims_mapping is not None) \ + and (compatible_dims_mapping != parent_node_dims_mapping): + parent_node_dist_attr.dims_mapping = compatible_dims_mapping + changed = True + if (compatible_dims_mapping is not None) \ + and (compatible_dims_mapping != child_node_dims_mapping): + parent_node_dist_attr.dims_mapping = compatible_dims_mapping + changed = True + return changed def _update_dims_mapping(self): # Complete dims_mapping for each node @@ -318,11 +312,314 @@ class Completer: node, fwd=is_fwd) if op_changed: changed = True + graph_changed = self._update_dims_mapping_between_graphs() + if graph_changed: + changed = True if changed: reach_fix_point = False else: reach_fix_point = True + def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + # Set the process mesh of the op node by its nearest op node + if not op_dist_attr.is_annotated("process_mesh"): + process_mesh = op_dist_attr.process_mesh + nearest_op_dis_attr = self._dist_context.get_dist_attr_for_graph( + nearest_op_node) + nearest_process_mesh = nearest_op_dis_attr.process_mesh + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh, nearest_process_mesh]) + if compatible_process_mesh is not None \ + and process_mesh != compatible_process_mesh: + op_dist_attr.process_mesh = compatible_process_mesh + # Skip the process_mesh setting of inputs and outputs of while_op + if op_dist_attr.op_type == "while": + return + # Set the process mesh of the op node's leaf-inputs + for tensor_node in op_node.inputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if tensor_dist_attr.is_annotated("process_mesh"): + continue + # Skip the non-leaf var node + if len(tensor_node.inputs) != 0: + continue + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and tensor_dist_attr.process_mesh != compatible_process_mesh: + tensor_dist_attr.process_mesh = compatible_process_mesh + # Set the process mesh of the op node's outputs + for tensor_node in op_node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if tensor_dist_attr.is_annotated("process_mesh"): + continue + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and tensor_dist_attr.process_mesh != compatible_process_mesh: + tensor_dist_attr.process_mesh = compatible_process_mesh + + def _update_process_mesh_for_specials(self): + def _find_nearest_tensor_node_before(nodes, idx, var_name): + for node in reversed(nodes[:idx]): + if node.is_var() and node.var() is not None \ + and node.var().name() == var_name: + return node + + def _find_nearest_tensor_node_after(nodes, idx, var_name): + for node in nodes[idx + 1:]: + if node.is_var() and node.var() is not None \ + and node.var().name() == var_name: + return node + + def _find_nodes_related_to_cond(source_node): + related_nodes = [] + visited = set() + frontier = list() + frontier.append(source_node) + # BFS + while len(frontier) != 0: + cur = frontier[0] + frontier = frontier[1:] + if _node_id(cur) in visited: + continue + # TODO: need more restrictions + for node in cur.inputs: + if node.is_var() and node.var() is not None: + if node.var().type() != core.VarDesc.VarType.READER \ + and len(node.var().shape()) == 1: + frontier.append(node) + related_nodes.append(node) + if node.is_op() and node.op() is not None: + flag = True + if node.op().type() == "create_py_reader" \ + or node.op().type() == "create_double_buffer_reader" \ + or node.op().type() == "read": + flag = False + for tensor_node in node.inputs: + if tensor_node.is_var() and tensor_node.var( + ) is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER \ + or len(tensor_node.var().shape()) != 1: + flag = False + break + for tensor_node in node.outputs: + if tensor_node.is_var() and tensor_node.var( + ) is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER \ + or len(tensor_node.var().shape()) != 1: + flag = False + break + if flag: + frontier.append(node) + related_nodes.append(node) + visited.add(_node_id(cur)) + return related_nodes + + # Amend the process meshes related to while_op + for while_op_node, while_op_node_idx in self._while_op_nodes.values(): + sub_graph_id = while_op_node.op()._block_attr_id("sub_block") + sub_graph = self._dist_context._serial_graph.get_sub_graph( + sub_graph_id) + sub_graph_nodes = list(sub_graph.all_nodes()) + while_dist_op = self._dist_context.get_dist_op_for_graph( + while_op_node) + while_op_dist_attr = while_dist_op.dist_attr + + # Step 1: set the process mesh of while_op to the merged process mesh of its subblock + merged_process_mesh = while_op_dist_attr.process_mesh + for node in sub_graph_nodes: + if (node.is_var() and node.var() is not None) \ + or (node.is_op() and node.op() is not None): + dist_attr = self._dist_context.get_dist_attr_for_graph(node) + merged_process_mesh = merge_process_mesh_two( + merged_process_mesh, dist_attr.process_mesh) + while_op_dist_attr.process_mesh = merged_process_mesh + + # Step 2: set the related nodes of while_op to the process mesh of while_op + # Step 2.1: Find related nodes of cond var the graph of while_op + cond_tensor_related_nodes = [] + cond_tensor_name = while_op_node.op().input("Condition")[0] + cond_tensor_node = None + for node in while_op_node.inputs: + if node.is_var() and node.var() is not None \ + and node.var().name() == cond_tensor_name: + cond_tensor_node = node + cond_tensor_related_nodes.append(cond_tensor_node) + break + + cond_tensor_related_nodes.extend( + _find_nodes_related_to_cond(cond_tensor_node)) + + # Step 2.2: Find related nodes of cond var in the subgraph of while_op + cond_tensor_node = None + for node in reversed(sub_graph_nodes): + if node.is_var() and node.var() is not None \ + and node.var().name() == cond_tensor_name \ + and len(node.outputs) == 0: + cond_tensor_node = node + break + + cond_tensor_related_nodes.extend( + _find_nodes_related_to_cond(cond_tensor_node)) + # Step 2.3: Add the StepScops output of while_op + stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0] + stepscopes_tensor_node = None + for output_node in while_op_node.outputs: + if output_node.is_var() and output_node.var() is not None \ + and output_node.var().name() == stepscopes_tensor_name: + stepscopes_tensor_node = output_node + cond_tensor_related_nodes.append(stepscopes_tensor_node) + # Step 2.4: Set the process meshes of all nodes related to cond var to the process mesh of while op + for node in cond_tensor_related_nodes: + tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + node) + tensor_dist_attr.process_mesh = merged_process_mesh + + # Step 3: set the process meshes of the inputs in while_op to the process meshes of the outside input nodes + while_op_inputs_dist_attrs = while_op_dist_attr.inputs_dist_attrs + for tensor_name, tensor_dist_attr in while_op_inputs_dist_attrs.items( + ): + nearest_tensor_node = _find_nearest_tensor_node_before( + self._dist_context.serial_ordered_nodes, while_op_node_idx, + tensor_name) + nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_tensor_node) + tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh + + # Step 4: set the process meshes of the outputs in while_op to the process meshes of the outside output nodes + while_op_outputs_dist_attrs = while_op_dist_attr.outputs_dist_attrs + for tensor_name, tensor_dist_attr in while_op_outputs_dist_attrs.items( + ): + nearest_tensor_node = _find_nearest_tensor_node_before( + self._dist_context.serial_ordered_nodes, while_op_node_idx, + tensor_name) + if nearest_tensor_node is None: + nearest_tensor_node = _find_nearest_tensor_node_after( + self._dist_context.serial_ordered_nodes, + while_op_node_idx, tensor_name) + nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_tensor_node) + tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh + + # Amend the process meshes related to array + for array_node_list in self._array_nodes.values(): + merged_process_mesh = None + for array_node in array_node_list: + dist_attr = self._dist_context.get_dist_attr_for_graph( + array_node) + merged_process_mesh = merge_process_mesh_two( + merged_process_mesh, dist_attr.process_mesh) + for array_node in array_node_list: + dist_attr = self._dist_context.get_dist_attr_for_graph( + array_node) + dist_attr.process_mesh = merged_process_mesh + + def _update_process_mesh(self): + ordered_op_nodes = self._dist_context._serial_ordered_op_nodes + + # Step 1: Set the annotated process meshes from tensors to the first ops using them + ordered_tensor_nodes = self._dist_context._serial_ordered_tensor_nodes + for tensor_node in ordered_tensor_nodes: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if not tensor_dist_attr.is_annotated("process_mesh"): + continue + first_op_node = None + for op_node in ordered_op_nodes: + # TODO: Need a better rule for the control flow ops. + # For now, do not set the process mesh of while_op from its inputs + if op_node.op().type() == "while": + continue + for input_tensor_node in op_node.inputs: + if _node_id(tensor_node) == _node_id(input_tensor_node): + first_op_node = op_node + break + if first_op_node is not None: + break + if first_op_node is None: + continue + op_dist_attr = self._dist_context.get_dist_attr_for_graph( + first_op_node) + if op_dist_attr is not None and not op_dist_attr.is_annotated( + "process_mesh"): + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and op_dist_attr.process_mesh != compatible_process_mesh: + op_dist_attr.process_mesh = compatible_process_mesh + + # Step 2: set the process meshes of ops with the nearest op before them + # Step 2.1: find the first op node which has the process mesh + idx_of_first_op_node_has_process_mesh = -1 + for idx, op_node in enumerate(ordered_op_nodes): + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + if op_dist_attr.process_mesh is not None \ + and idx_of_first_op_node_has_process_mesh == -1: + idx_of_first_op_node_has_process_mesh = idx + # Reuse the following method to set the related tensors for same op node + self._update_process_mesh_by_nearest(op_node, op_node) + # Step 2.2: set the process meshes of ops by the nearest op node after the first op node + if idx_of_first_op_node_has_process_mesh + 1 > len(ordered_op_nodes): + return None + for idx, op_node in enumerate(ordered_op_nodes[ + idx_of_first_op_node_has_process_mesh + 1:]): + original_idx = idx_of_first_op_node_has_process_mesh + +idx + 1 + nearest_op_node = ordered_op_nodes[original_idx - 1] + nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_op_node) + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + assert nearest_op_dist_attr.process_mesh is not None + self._update_process_mesh_by_nearest(op_node, nearest_op_node) + # Step 2.3: set the process meshes of ops by the nearest op node before the first op node + nearest_op_node = ordered_op_nodes[ + idx_of_first_op_node_has_process_mesh] + for op_node in ordered_op_nodes[:idx_of_first_op_node_has_process_mesh]: + self._update_process_mesh_by_nearest(op_node, nearest_op_node) + + # Step 3: adjust the process meshes for special ops + self._update_process_mesh_for_specials() + + def _prepare(self): + self._while_op_nodes = {} + self._array_nodes = {} + self._node_pairs_between_graphs = [] + all_nodes = self._dist_context.serial_ordered_nodes + for idx, node in enumerate(all_nodes): + if node.is_op(): + if node.op().type() == "while": + self._while_op_nodes[_node_id(node)] = (node, idx) + if node.op().type() == "read_from_array": + array_var_name = node.op().input("X")[0] + if self._array_nodes.get(array_var_name, None) is None: + self._array_nodes[array_var_name] = [] + self._array_nodes[array_var_name].append(node) + if node.op().type() == "write_to_array": + array_var_name = node.op().output("Out")[0] + if self._array_nodes.get(array_var_name, None) is None: + self._array_nodes[array_var_name] = [] + self._array_nodes[array_var_name].append(node) + self._array_nodes[array_var_name].append(node.outputs[0]) + if node.is_var() and node.var() is not None: + if node.node.graph_id() != 0: + for before_node in reversed(all_nodes[:idx]): + if before_node.is_var() and before_node.var() is not None \ + and before_node.node.graph_id() == node.node.graph_id() - 1 \ + and before_node.var().name() == node.var().name(): + self._node_pairs_between_graphs.append( + (before_node, node)) + for after_node in all_nodes[idx + 1:]: + if after_node.is_var() and after_node.var() is not None \ + and after_node.node.graph_id() == node.node.graph_id() - 1 \ + and after_node.var().name() == node.var().name(): + self._node_pairs_between_graphs.append( + (after_node, node)) + def complete_forward_annotation(self, serial_main_program): """ Complete annotation for the partial annotated serial_main_program. Arguments: @@ -336,24 +633,24 @@ class Completer: # Initialize distributed attributes for all var and op node in serial_main_program self._dist_context.init_dist_attr_for_program() + # print_program_with_dist_attr(serial_main_program, self._dist_context) # Initialize distributed attributes for all var and op node in graph self._dist_context.init_dist_attr_for_graph() + self._prepare() + self._update_process_mesh() - # Complete dims_mapping for each node self._update_dims_mapping() # Copy the corresponding distributed attribute from graph to serial_main_program self._dist_context.copy_dist_attr_from_graph_to_program() self._dist_context.clear_dist_info_for_graph() - # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) # Do the validation check and amend some completion self._dist_context.amend_dist_attr_for_program() - # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) self._dist_context.validate_dist_attr_for_program() return serial_main_program diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index b27cd7a37c95626584194ae7bd619ab16a0e5ea7..8ec702ffcb0b65af96833b4d4d2be1c8ff08d788 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -175,6 +175,7 @@ class TensorDistributedAttribute: class OperatorDistributedAttribute: def __init__(self): self._process_mesh = None + self._op_type = None self._impl_type = None self._impl_idx = None self._inputs_dist_attrs = {} @@ -194,11 +195,23 @@ class OperatorDistributedAttribute: if isinstance(process_mesh, list): process_mesh = ProcessMesh(process_mesh) self._process_mesh = copy.deepcopy(process_mesh) + # In while op, the proess mesh is not shared by all inputs and outputs + if self._op_type == "while": + return None for dist_attr in self._inputs_dist_attrs.values(): dist_attr.process_mesh = process_mesh for dist_attr in self._outputs_dist_attrs.values(): dist_attr.process_mesh = process_mesh + @property + def op_type(self): + return self._op_type + + @op_type.setter + def op_type(self, op_type): + if op_type is not None: + self._op_type = op_type + @property def impl_type(self): return self._impl_type @@ -326,6 +339,8 @@ class OperatorDistributedAttribute: assert False, "No setter for {} in args {}.".format( key, dist_attr) # Make sure proscess_meshes in dist op be same + if self.op_type == "while": + return None process_meshes = [] process_meshes.append(self.process_mesh) for tensor_dist_attr in self.inputs_dist_attrs.values(): diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 573f23fdca519ae1da10d62ef7eb2da6238805f3..2807c46540ab1e52f7490c850faa34eac00c04db 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -15,6 +15,7 @@ import copy from collections import defaultdict from paddle.fluid import framework +from paddle.fluid.framework import get_flags, set_flags from paddle.fluid import core from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute @@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context): _g_default_distributed_context = dist_context +def _node_id(node): + return (node.node.graph_id(), node.node.id()) + + class DistributedContext: """ DistributedContext is used to collect related distributed information for program and graph. @@ -146,7 +151,7 @@ class DistributedContext: return None def get_dist_tensor_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() + serial_tensor_node_id = _node_id(serial_tensor_node) return self._dist_tensors_for_graph.get(serial_tensor_node_id, None) def get_dist_op_for_program(self, serial_op): @@ -168,7 +173,7 @@ class DistributedContext: del self._dist_ops_for_program[serial_tensor_id] def get_dist_op_for_graph(self, serial_op_node): - serial_op_node_id = serial_op_node.id() + serial_op_node_id = _node_id(serial_op_node) return self._dist_ops_for_graph.get(serial_op_node_id, None) def get_tensor_dist_attr_for_program(self, serial_tensor): @@ -197,7 +202,7 @@ class DistributedContext: self.add_dist_tensor_for_program(dist_tensor) def get_tensor_dist_attr_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() + serial_tensor_node_id = _node_id(serial_tensor_node) dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id, None) if dist_tensor: @@ -242,7 +247,7 @@ class DistributedContext: self.add_dist_op_for_program(dist_op) def get_op_dist_attr_for_graph(self, serial_op_node): - serial_op_node_id = serial_op_node.id() + serial_op_node_id = _node_id(serial_op_node) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) if dist_op: return dist_op.dist_attr @@ -262,7 +267,7 @@ class DistributedContext: def get_dist_attr_for_graph(self, serial_node): if serial_node.is_var() and serial_node.var() is not None: - serial_tensor_node_id = serial_node.id() + serial_tensor_node_id = _node_id(serial_node) dist_tensor = self._dist_tensors_for_graph.get( serial_tensor_node_id, None) if dist_tensor: @@ -270,7 +275,7 @@ class DistributedContext: else: return None if serial_node.is_op() and serial_node.op() is not None: - serial_op_node_id = serial_node.id() + serial_op_node_id = _node_id(serial_node) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) if dist_op: return dist_op.dist_attr @@ -311,40 +316,69 @@ class DistributedContext: def order_nodes_by_program_order(self): def _contains(nodes, target_node): for node in nodes: - if node.id() == target_node.id(): + if _node_id(node) == _node_id(target_node): return True return False - ordered_tensor_nodes = [] - ordered_op_nodes = [] - all_nodes = self._serial_graph.all_nodes() + serial_ordered_tensor_nodes = [] + serial_ordered_op_nodes = [] + all_nodes = [] + # for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): + for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): + for node in graph.all_nodes(): + all_nodes.append(node) for node in all_nodes: if node.is_var() and node.var() is not None: - ordered_tensor_nodes.append(node) + serial_ordered_tensor_nodes.append(node) if node.is_op() and node.op() is not None: - ordered_op_nodes.append(node) - ordered_tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) - ordered_op_nodes.sort(key=lambda node: node.node.original_desc_id()) - for op_node in ordered_op_nodes: + serial_ordered_op_nodes.append(node) + serial_ordered_tensor_nodes.sort( + key=lambda node: node.node.original_desc_id()) + serial_ordered_op_nodes.sort( + key=lambda node: node.node.original_desc_id()) + num_nodes_before = len(serial_ordered_tensor_nodes) + len( + serial_ordered_op_nodes) + + new_serial_ordered_tensor_nodes = [] + new_serial_ordered_op_nodes = [] + for op_node in serial_ordered_op_nodes: tensor_nodes = [] for tensor_node in op_node.inputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ and not _contains(self._serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) + new_serial_ordered_tensor_nodes.append(tensor_node) tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) self._serial_ordered_nodes.extend(tensor_nodes) self._serial_ordered_nodes.append(op_node) + new_serial_ordered_op_nodes.append(op_node) tensor_nodes = [] for tensor_node in op_node.outputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ and not _contains(self._serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) + new_serial_ordered_tensor_nodes.append(tensor_node) + tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) self._serial_ordered_nodes.extend(tensor_nodes) - num_nodes_before = len(ordered_tensor_nodes) + len(ordered_op_nodes) - assert len(self._serial_ordered_nodes) == num_nodes_before, \ - "The number of nodes before ordering is not the same after ordering." + new_serial_ordered_tensor_nodes.sort( + key=lambda node: node.node.original_desc_id()) + new_serial_ordered_op_nodes.sort( + key=lambda node: node.node.original_desc_id()) + self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes + self._serial_ordered_op_nodes = new_serial_ordered_op_nodes + assert len(self._serial_ordered_nodes) == len( + self._serial_ordered_tensor_nodes) + len( + self._serial_ordered_op_nodes) + self._serial_orphan_tensor_nodes = [] + for tensor_node in serial_ordered_tensor_nodes: + if not _contains(self._serial_ordered_tensor_nodes, tensor_node): + self._serial_orphan_tensor_nodes.append(tensor_node) + if len(self._serial_ordered_nodes) != num_nodes_before: + print( + "WARNING: there are some orphan tensors or ops which are not used in the execution." + ) def init_dist_attr_for_graph(self): assert self._is_initialized_for_program, \ @@ -352,9 +386,9 @@ class DistributedContext: if self._is_initialized_for_graph: return # Convert program to graph + set_flags({"FLAGS_convert_all_blocks": True}) self._serial_graph = framework.IrGraph( core.Graph(self._serial_program.desc)) - all_nodes = self._serial_graph.all_nodes() self.order_nodes_by_program_order() for node in self.serial_ordered_nodes: if node.is_var() and node.var() is not None: @@ -365,10 +399,11 @@ class DistributedContext: if tensor_id == cur_tensor_id \ or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): dist_tensor = cur_dist_tensor - self._node_id_to_tensor_id[node.id()] = cur_tensor_id + self._node_id_to_tensor_id[_node_id( + node)] = cur_tensor_id assert dist_tensor is not None, \ "Tensor must have a distributed tensor after the initialization for program." - serial_tensor_node_id = node.id() + serial_tensor_node_id = _node_id(node) new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, dist_tensor.dist_attr) self._dist_tensors_for_graph[ @@ -381,10 +416,10 @@ class DistributedContext: if op_id == cur_op_id \ or op_id == cur_dist_op.serial_op.desc.original_id(): dist_op = cur_dist_op - self._node_id_to_op_id[node.id()] = cur_op_id + self._node_id_to_op_id[_node_id(node)] = cur_op_id assert dist_op is not None, \ "Operator must have a distributed operator after the initialization for program." - serial_op_node_id = node.id() + serial_op_node_id = _node_id(node) new_dist_op = DistributedOperator(dist_op.serial_op, dist_op.dist_attr) self._dist_ops_for_graph[serial_op_node_id] = new_dist_op @@ -402,10 +437,11 @@ class DistributedContext: assert self._is_initialized_for_program and self._is_initialized_for_graph, \ "Both program and graph must be initialized." updated_tensors = {} - all_nodes = self._serial_graph.all_nodes() + # all_nodes = self._serial_graph.all_nodes() + all_nodes = self._serial_ordered_nodes for node in all_nodes: if node.is_var() and node.var() is not None: - tensor_id = self._node_id_to_tensor_id[node.id()] + tensor_id = self._node_id_to_tensor_id[_node_id(node)] updated = updated_tensors.get(tensor_id, False) # If a var has multiples var nodes in graph, only use the first one for now if not updated: @@ -416,16 +452,31 @@ class DistributedContext: dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph updated_tensors[tensor_id] = True if node.is_op() and node.op() is not None: - op_id = self._node_id_to_op_id[node.id()] + op_id = self._node_id_to_op_id[_node_id(node)] op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph + # TODO: the completion algorithm will skip orphan tensors, + # here we just set there process_mesh to the first one. + for orphan_node in self._serial_orphan_tensor_nodes: + serial_tensor_id = orphan_node.var().id() + dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, + None) + if dist_tensor: + dist_tensor.dist_attr.process_mesh = self._process_meshes[0] + else: + serial_tensor_id = orphan_node.var().original_id() + dist_tensor = self._dist_tensors_for_program.get( + serial_tensor_id, None) + dist_tensor.dist_attr.process_mesh = self._process_meshes[0] def amend_dist_attr_for_program(self): for dist_tensor in self._dist_tensors_for_program.values(): serial_tensor = dist_tensor.serial_tensor dist_attr = dist_tensor.dist_attr - if serial_tensor.type == core.VarDesc.VarType.READER: + if serial_tensor.type == core.VarDesc.VarType.READER \ + or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = serial_tensor.shape @@ -446,6 +497,7 @@ class DistributedContext: tensor_shape = [] else: if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \ + or dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ or dist_op.serial_op.type == "create_py_reader": tensor_shape = [] else: @@ -459,8 +511,9 @@ class DistributedContext: and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 for arg_name in serial_op.output_arg_names: - if dist_op.get_serial_output( - arg_name).type == core.VarDesc.VarType.READER: + if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \ + or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = dist_op.get_serial_output(arg_name).shape @@ -498,7 +551,8 @@ class DistributedContext: for k, v in self.__dict__.items(): if k == "_serial_program" or k == "_serial_graph" \ or k == "_dist_main_programs" or k == "_dist_startup_programs" \ - or k == "_serial_ordered_nodes": + or k == "_serial_ordered_nodes" or k == "_serial_ordered_tensor_nodes" \ + or k == "_serial_ordered_op_nodes": setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index 67de298564afc8caddad90d228131f1795f5707e..a2c2748a8cea390003dfec857a252b7df3ee1b05 100644 --- a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -76,7 +76,8 @@ class DistributedOperator: if tensor is None: tensor_shape = [] else: - if tensor.type == core.VarDesc.VarType.READER: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: tensor_shape = [] else: tensor_shape = tensor.shape @@ -86,7 +87,9 @@ class DistributedOperator: tensor_dims_mapping) for tensor_name in self._serial_op.output_arg_names: tensor = self._serial_op.block._var_recursive(tensor_name) - if tensor.type == core.VarDesc.VarType.READER or tensor.type == core.VarDesc.VarType.STEP_SCOPES: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = tensor.shape @@ -95,6 +98,8 @@ class DistributedOperator: tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] self._dist_attr.set_output_dims_mapping(tensor_name, tensor_dims_mapping) + if self._dist_attr.op_type is None: + self._dist_attr.op_type = self.serial_op.type if self._dist_attr.impl_type is None: self._dist_attr.impl_type = "default" if self._dist_attr.impl_idx is None: @@ -134,12 +139,16 @@ class DistributedOperator: return new_dist_attr def validate_dist_attr(self): - if "read" in self.serial_op.type: + if "read" in self.serial_op.type or "while" == self.serial_op.type: return True for name in self.serial_op.input_arg_names: input_dist_attr = self.dist_attr.get_input_dist_attr(name) dims_mapping = input_dist_attr.dims_mapping - shape = self.get_serial_input(name).shape + if self.get_serial_input( + name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + shape = [] + else: + shape = self.get_serial_input(name).shape if len(shape) != len(dims_mapping): return False for i in range(len(dims_mapping)): @@ -155,7 +164,11 @@ class DistributedOperator: for name in self.serial_op.output_arg_names: output_dist_attr = self.dist_attr.get_output_dist_attr(name) dims_mapping = output_dist_attr.dims_mapping - shape = self.get_serial_output(name).shape + if self.get_serial_output(name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY\ + or self.get_serial_output(name).type == core.VarDesc.VarType.STEP_SCOPES: + shape = [] + else: + shape = self.get_serial_output(name).shape if len(shape) != len(dims_mapping): return False for i in range(len(dims_mapping)): @@ -241,14 +254,14 @@ class DistributedModule: def __call__(self, *args, **kwargs): from .dist_context import get_default_distributed_context - main_prog = paddle.fluid.default_main_program() - main_block = main_prog.global_block() - op_size = len(main_block.ops) + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + op_size = len(cur_block.ops) output = self._serial_module(*args, **kwargs) - new_op_size = len(main_block.ops) + new_op_size = len(cur_block.ops) default_dist_ctx = get_default_distributed_context() for idx in range(op_size, new_op_size): - op = main_block.ops[idx] + op = cur_block.ops[idx] dist_op = DistributedOperator(op, self._dist_attr) dist_op.dist_attr.mark_annotated_as(self._dist_attr) default_dist_ctx.add_dist_op_for_program(dist_op) diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index 5e3c852699ab6f8dcb92b386989338e5ca3d2c1f..a42ce863492b3511e0e7ddfaa3a04b67f57e1157 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -184,7 +184,9 @@ class DistributedTensor: def _init_default_dist_attr(self): if self._dist_attr.dims_mapping is None: - if self.serial_tensor.type == core.VarDesc.VarType.READER: + if self.serial_tensor.type == core.VarDesc.VarType.READER \ + or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = self._serial_tensor.shape @@ -192,7 +194,9 @@ class DistributedTensor: self._dist_attr.dims_mapping = tensor_dims_mapping def validate_dist_attr(self): - if self.serial_tensor.type == core.VarDesc.VarType.READER: + if self.serial_tensor.type == core.VarDesc.VarType.READER \ + or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: return True tensor_shape = self.serial_tensor.shape if len(tensor_shape) != len(self.dist_attr.dims_mapping): diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 56beb8957415d3c3c401fdbf754cb17fc5e253a7..6bd1c5527a99e73ddcde1ada5f2a5a496c0d9933 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -259,7 +259,7 @@ class Engine: "train_" + name: val for name, val in logs.items() } - self._logger.info(logs) + self._logger.info(train_logs) def _train_step(self, data): logs = {} diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 4b079e7b6b575a6bcfd372782529ccc2958cf5db..47f76353e465529f1d29a05852a952d151c76c93 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -17,7 +17,9 @@ from ..dist_attribute import OperatorDistributedAttribute _g_distributed_operator_impl_containers = {} -_g_elementwise_ops = ["elementwise_add", "gelu", "dropout", "cast"] +_g_elementwise_ops = [ + "elementwise_add", "gelu", "dropout", "cast", "gather", "concat" +] BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 4e977007261a73e9b24a051f84e6e30f2bf9d860..de6d018d60521564ebc98b8df03e4b1356b846c8 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -55,9 +55,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): op_dist_attr = dist_op.dist_attr for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) - if serial_tensor.is_parameter: - continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) + if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False + # continue + # if len(dims_mapping) < 1: + # continue if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: if mapping != -1: @@ -73,9 +78,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): xshape_arg_names = op_desc.output("XShape") for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) - if serial_tensor.is_parameter: - continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False + # continue + # if len(dims_mapping) < 1: + # continue if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: @@ -104,7 +114,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[1:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) # Check output compatibility output_names = op_desc.output_names() @@ -121,7 +132,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[1:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: if dims_mapping[0] != -1: return False @@ -129,7 +141,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[2:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[1]) + if len(dims_mapping) >= 2: + batch_dim_mappings.append(dims_mapping[1]) # Check batch dim mapping compatibility if not all(batch_dim_mappings[0] == dim_mapping @@ -143,7 +156,9 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr # The following statement will be replaced by a more elegent way - if op_desc.type() == "shape" or op_desc.type() == "slice": + if op_desc.type() == "shape" \ + or op_desc.type() == "slice" \ + or op_desc.type() == "while": return False output_names = op_desc.output_names() xshape_arg_names = [] @@ -155,17 +170,22 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: batch_dim_mappings.append(dims_mapping[1]) + if not batch_dim_mappings: + return changed + compatible_dim_mapping = compute_compatible_dim_mapping( batch_dim_mappings) assert compatible_dim_mapping is not None, "There is no compatible dim mapping." @@ -174,7 +194,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True for arg_name in op_desc.output_arg_names(): @@ -183,11 +204,13 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True else: - if compatible_dim_mapping != dims_mapping[1]: + if len(dims_mapping + ) >= 2 and compatible_dim_mapping != dims_mapping[1]: dims_mapping[1] = compatible_dim_mapping changed = True diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 058ae1d0a9fd5c25ec83ea15ed9c2e479322957c..c92142cf7384d2b0c76c1a5cb3b4e6ac257303a2 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -1432,7 +1432,6 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl): if is_valid_list_index(y_dims_mapping, -2) and is_dim_shard(y_dims_mapping[-2]): return False - return True def is_output_compatible(self, dist_op): diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/tuner/recorder.py new file mode 100644 index 0000000000000000000000000000000000000000..140336566a146776f805f7b546fe6bb39c267861 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/recorder.py @@ -0,0 +1,214 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +class MetricRecord(object): + """ + One record for a single metric at a given execution step. + """ + + def __init__(self, value, step): + self._value = value + self._step = step + + @property + def value(self): + return self._value + + @value.setter + def value(self, value): + self._value = value + + @property + def step(self): + return self._step + + @step.setter + def step(self, step): + self._step = step + + def mean(self): + return np.mean(self.value) + + def get_state(self): + return {"value": self.value, "step": self.step} + + @classmethod + def from_state(cls, state): + return cls(**state) + + def __eq__(self, other): + if not isinstance(other, MetricRecord): + return False + return other.value == self.value and other.step == self.step + + def __repr__(self): + return "MetricRecord(value={}, step={})".format(self.value, self.step) + + +class MetricRecords(object): + """ + Records of a single metric across different executions. + """ + + def __init__(self, direction="min"): + if direction not in {"min", "max"}: + raise ValueError( + "direction should be one of {min, max}, but got: {}.".format( + direction)) + self._direction = direction + self._records = {} + + @property + def records(self): + return sorted(self._records.values(), key=lambda r: r.step) + + @records.setter + def records(self, records): + for r in records: + self.update(r.value, step=r.step) + + @property + def direction(self): + return self._direction + + @direction.setter + def direction(self, direction): + self._direction = direction + + def update(self, value, step=0): + if step in self._records: + self._records[step].set_value(value) + else: + self._records[step] = MetricRecord(value, step=step) + + def get_best_value(self): + values = list(r.mean() for r in self._records.values()) + if not values: + return None + if self._direction == "min": + return np.nanmin(values) + return np.nanmax(values) + + def get_best_step(self): + best_value = self.get_best_value() + if best_value is None: + return None + for r in self._records.values(): + if r.mean() == best_value: + return r.step + + def get_statistics(self): + records = self.records + records_values = [r.mean() for r in records] + if not len(records_values): + return {} + return { + "min": float(np.nanmin(records_values)), + "max": float(np.nanmax(records_values)), + "mean": float(np.nanmean(records_values)), + "median": float(np.nanmedian(records_values)), + "var": float(np.nanvar(records_values)), + "std": float(np.nanstd(records_values)), + } + + def get_state(self): + state = {} + state["direction"] = self._direction + state["records"] = [r.get_state() for r in self.records] + return state + + @classmethod + def from_state(cls, state): + records = cls(state["direction"]) + records.records = [MetricRecord.from_state(r) for r in state["records"]] + print("here 1", records.records) + return records + + +class MetricsRecorder(object): + """ + Record the values for all metrics. + """ + + def __init__(self, metrics=None): + self._records = {} + self.register_metrics(metrics) + + @property + def records(self): + return self._records + + def exists(self, name): + return name in self._records + + def register_metrics(self, metrics=None): + metrics = metrics or [] + for metric in metrics: + self.register(metric.name) + + def register(self, name, direction=None): + if self.exists(name): + raise ValueError("Metric {} have been registered.".format(name)) + if direction is None: + direction = "min" + self._records[name] = MetricRecords(direction) + + def update(self, name, value, step=0): + value = float(value) + if not self.exists(name): + self.register(name) + + prev_best = self._records[name].get_best_value() + self._records[name].update(value, step=step) + new_best = self._records[name].get_best_value() + + improved = new_best != prev_best + return improved + + def get_records(self, name): + return self._records[name].records + + def set_records(self, name, records): + if not self.exists(name): + self.register(name) + self._records[name].records = records + + def get_best_value(self, name): + return self._records[name].get_best_value() + + def get_best_step(self, name): + return self._records[name].get_best_step() + + def get_statistics(self, name): + return self._records[name].get_statistics() + + def get_state(self): + return { + "metrics": { + name: metric_records.get_state() + for name, metric_records in self._records.items() + } + } + + @classmethod + def from_state(cls, state): + recorder = cls() + recorder._records = { + name: MetricRecords.from_state(metric_records) + for name, metric_records in state["metrics"].items() + } + return recorder diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py new file mode 100644 index 0000000000000000000000000000000000000000..22a6638c5ca63b953dcac3d62c564acf6087a305 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/trial.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import random +import time +from enum import Enum + +from .storable import Storable +from .recorder import MetricsRecorder +from .tunable_space import TunableSpace + + +class TrialStatus: + RUNNING = "RUNNING" + COMPLETED = "COMPLETED" + STOPPED = "STOPPED" + INVALID = "INVALID" + + +class Trial(Storable): + def __init__(self, tunable_space, trial_id=None, + status=TrialStatus.RUNNING): + self._id = _generate_trial_id() if trial_id is None else trial_id + self._space = tunable_space + self._recorder = MetricsRecorder() + self._score = None + self._best_step = None + self._status = status + + @property + def id(self): + return self._id + + @property + def space(self): + return self._space + + @property + def recorder(self): + return self._recorder + + @property + def score(self): + return self._score + + @score.setter + def score(self, score): + self._score = score + + @property + def best_step(self): + return self._best_step + + @best_step.setter + def best_step(self, best_step): + self._best_step = best_step + + @property + def status(self): + return self._status + + @status.setter + def status(self, status): + self._status = status + + def summary(self): + print("Tunable space:") + if self.space.values: + for tv, value in self.space.values.items(): + print(tv + ":", value) + + if self.score is not None: + print("Score: {}".format(self.score)) + + def get_state(self): + return { + "id": self.id, + "space": self.space.get_state(), + "recorder": self.recorder.get_state(), + "score": self.score, + "best_step": self.best_step, + "status": self.status, + } + + def set_state(self, state): + self._id = state["id"] + self._space = TunableSpace.from_state(state["space"]) + self._recorder = MetricsRecorder.from_state(state["recorder"]) + self._score = state["score"] + self._best_step = state["best_step"] + self._status = state["status"] + + @classmethod + def from_state(cls, state): + trial = cls(tunable_space=None) + trial.set_state(state) + return trial + + +def _generate_trial_id(): + s = str(time.time()) + str(random.randint(1, int(1e7))) + return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32] diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 241eadcbace22cf36504e2c0ed36566fa94b9e4b..86c274cb45cc323dab60968571837e82619e6987 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context, used_dist_context._dist_op_context = DistributedOperatorContext() _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program( rank_id, used_dist_context) - # print("dist_main_program: ", dist_main_program) all_dist_main_program.append(dist_main_program) return all_dist_main_program diff --git a/python/paddle/distributed/models/__init__.py b/python/paddle/distributed/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1663029ef1f844676ce9484f724dc253d625386 --- /dev/null +++ b/python/paddle/distributed/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/models/moe/__init__.py b/python/paddle/distributed/models/moe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1663029ef1f844676ce9484f724dc253d625386 --- /dev/null +++ b/python/paddle/distributed/models/moe/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fd98c64318c60e2e67af320c51b24e39a3132c43 --- /dev/null +++ b/python/paddle/distributed/models/moe/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import in_dygraph_mode + + +def _number_count(gate_idx, upper_range): + """ + calculate the expert count according to the gate index. + Args: + gate_idx (Tensor): Tensor. The input gate index whose data type should be int32 or int64. + upper_range (int): The number of the experts. + Returns: + out (Tensor): The output expert count. + Examples: + .. code-block:: python + # required: distributed + import paddle + + gate_idx = [ + [0, 2], + [0, 2] + ] + upper_range = 6 + gate_idx = paddle.to_tensor(gate_idx, dtype="int32") + number_count = paddle.distributed.utils.number_count(gate_idx, upper_range) + print(number_count) # the result: [2, 0, 2, 0, 0, 0] + """ + if in_dygraph_mode(): + return core.ops.number_count(gate_idx, 'upper_range', upper_range) + else: + op_type = 'number_count' + + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=gate_idx.dtype) + + helper.append_op( + type=op_type, + inputs={'gate_idx': gate_idx}, + outputs={'Out': out}, + attrs={'upper_range': upper_range}) + return out diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7480909a2d88dda51971d0ef66ae6c88a56cd79c..fb9e8d8ece100baa3ed7c65a8dc495aa12c254ff 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -228,3 +228,5 @@ if core.is_compiled_with_npu(): atexit.register(core.clear_executor_cache) # NOTE(Aganlengzi): clean up KernelFactory in advance manually. atexit.register(core.clear_kernel_factory) +# NOTE(wangran16): clean up DeviceManger in advance manually. +atexit.register(core.clear_device_manager) diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 80d2ccb0d5ca6fcb3a802014a860bfb2ff9b3400..9dba5d658dfc9f480c5e668be2c34b2bcb673078 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -173,6 +173,9 @@ if core.is_compiled_with_xpu(): elif core.is_compiled_with_npu(): _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'NPU', core.VarDesc.VarType.FP16) +elif core.is_compiled_with_mlu(): + _, _, _sys_unsupported_fp16_list = core.op_supported_infos( + 'MLU', core.VarDesc.VarType.FP16) else: _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'GPU', core.VarDesc.VarType.FP16) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 97b4116826a2a0e809af4a4129a0e953fd607b07..d614630b3db12d47d7b6790e40edefbd0402e384 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -979,8 +979,6 @@ class PostTrainingQuantization(object): if op.type in ( self._quantizable_op_type + self._out_scale_op_list): out_var_names = _get_op_output_var_names(op) - assert len(out_var_names) == 1, "Post training " + \ - "quantization only support one output for " + op.type for var_name in out_var_names: analysis_and_save_info(op, var_name) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index efa000274d01a601d6949c16803020272da918b4..afca617b6dd82b9106dbbb4a28e89090b0ad5278 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -59,6 +59,7 @@ _out_scale_op_list = [ "tanh", "prelu", "swish", + "dropout", "softmax", "batch_norm", "layer_norm", @@ -68,6 +69,8 @@ _out_scale_op_list = [ "transpose2", "concat", "elementwise_mul", + "elementwise_pow", + "elementwise_sub", "scale", "slice", "hard_swish", @@ -81,8 +84,54 @@ _out_scale_op_list = [ "flatten2", "transpose", "pad2d", + "pad3d", "reshape", - "layer_norm", + "split", + "flatten_contiguous_range", + "squeeze", + "squeeze2", + "nearest_interp_v2", + "fill_constant_batch_size_like", + "bilinear_interp", + "bilinear_interp_v2", + "arg_max", + "abs", + "assign", + "cast", + "clip", + "box_coder", + "crop", + "cumsum", + "equal", + "expand_v2", + "fill_any_like", + "fill_constant", + "gelu", + "instance_norm", + "lookup_table", + "lookup_table_v2", + "norm", + "p_norm", + "pow", + "reduce_mean", + "stack", + "top_k_v2", + "unsqueeze", + "unsqueeze2", + "logical_and", + "logical_not", + "meshgrid", + "roi_align", + "strided_slice", + "where", + "grid_sampler", + "tile", + "group_norm", + "reduce_sum", + "square", + "softplus", + "gather", + "shuffle_channel", ] # list op real input and output names, to avoid processing input such as AxisTensor. @@ -119,7 +168,7 @@ _op_real_in_out_name = { "relu": [["X"], ["Out"]], "relu6": [["X"], ["Out"]], "leaky_relu": [["X"], ["Out"]], - "prelu": [["X"], ["Out"]], + "prelu": [["X", "Alpha"], ["Out"]], "tanh": [["X"], ["Out"]], "swish": [["X"], ["Out"]], "dropout": [["X"], ["Out"]], @@ -127,16 +176,59 @@ _op_real_in_out_name = { "layer_norm": [["X"], ["Y"]], "sigmoid": [["X"], ["Out"]], "elementwise_mul": [["X", "Y"], ["Out"]], + "elementwise_pow": [["X", "Y"], ["Out"]], "scale": [["X"], ["Out"]], "hard_swish": [["X"], ["Out"]], "hard_sigmoid": [["X"], ["Out"]], "gru": [["Input", "Weight"], ["Hidden"]], "lstm": [["Input", "Weight"], ["Hidden"]], "pad2d": [["X"], ["Out"]], + "pad3d": [["X"], ["Out"]], "flatten": [["X"], ["Out"]], "flatten2": [["X"], ["Out"]], "unsqueeze2": [["X"], ["Out"]], - "flatten_contiguous_range": [['X'], ["Out"]], + "unsqueeze2": [["X"], ["Out"]], + "flatten_contiguous_range": [["X"], ["Out"]], + "split": [["X"], ["Out"]], + "squeeze2": [["X"], ["Out"]], + "nearest_interp_v2": [["X"], ["Out"]], + "bilinear_interp": [["X"], ["Out"]], + "bilinear_interp_v2": [["X"], ["Out"]], + "fill_constant_batch_size_like": [["Input"], ["Out"]], + "arg_max": [["X"], ["Out"]], + "abs": [["X"], ["Out"]], + "assign": [["X"], ["Out"]], + "cast": [["X"], ["Out"]], + "clip": [["X"], ["Out"]], + "box_coder": [["PriorBox"], ["OutputBox"]], + "crop": [["X"], ["Out"]], + "cumsum": [["X"], ["Out"]], + "expand_v2": [["X"], ["Out"]], + "fill_any_like": [["X"], ["Out"]], + "fill_constant": [[], ["Out"]], + "gelu": [["X"], ["Out"]], + "instance_norm": [["X"], ["Out"]], + "lookup_table": [["W", "Ids"], ["Out"]], + "lookup_table_v2": [["W", "Ids"], ["Out"]], + "norm": [["X"], ["Norm"]], + "p_norm": [["X"], ["Out"]], + "pow": [["X"], ["Out"]], + "reduce_mean": [["X"], ["Out"]], + "stack": [["X"], ["Y"]], + "top_k_v2": [["X"], ["Out", "Indices"]], + "logical_and": [["X", "Y"], ["Out"]], + "logical_not": [["X"], ["Out"]], + "meshgrid": [["X"], ["Out"]], + "roi_align": [["X", "ROIs"], ["Out"]], + "strided_slice": [["Input"], ["Out"]], + "where": [["Condition", "X", "Y"], ["Out"]], + "grid_sampler": [["X", "Grid"], ["Output"]], + "tile": [["X"], ["Out"]], + "group_norm": [["X"], ["Y", "Mean", "Variance"]], + "reduce_sum": [["X"], ["Out"]], + "square": [["X"], ["Out"]], + "softplus": [["X"], ["Out"]], + "shuffle_channel": [["X"], ["Out"]], } _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] @@ -1797,14 +1889,93 @@ class AddQuantDequantPass(object): quantized ops's inputs. """ _supported_quantizable_op_type = [ - "pool2d", "elementwise_add", "concat", "softmax", "argmax", "transpose", - "equal", "gather", "greater_equal", "greater_than", "less_equal", - "less_than", "mean", "not_equal", "reshape", "reshape2", - "bilinear_interp", "nearest_interp", "trilinear_interp", "slice", - "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6", - "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2", - "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm", - "matmul_v2" + "pool2d", + "elementwise_add", + "concat", + "softmax", + "argmax", + "transpose", + "equal", + "gather", + "greater_equal", + "greater_than", + "less_equal", + "less_than", + "mean", + "not_equal", + "reshape", + "reshape2", + "dropout", + "bilinear_interp", + "nearest_interp", + "trilinear_interp", + "slice", + "squeeze", + "elementwise_sub", + "mul", + "matmul", + "relu", + "relu6", + "leaky_relu", + "tanh", + "swish", + "scale", + "transpose", + "transpose2", + "sigmoid", + "pad2d", + "flatten", + "flatten2", + "batch_norm", + "layer_norm", + "matmul_v2", + "split", + "flatten_contiguous_range", + "squeeze2", + "nearest_interp_v2", + "bilinear_interp", + "bilinear_interp_v2", + "fill_constant_batch_size_like", + "arg_max", + "abs", + "assign", + "cast", + "clip", + "box_coder", + "crop", + "cumsum", + "elementwise_mul", + "elementwise_pow", + "expand_v2", + "fill_any_like", + "fill_constant", + "gelu", + "hard_sigmoid", + "hard_swish", + "instance_norm", + "lookup_table", + "lookup_table_v2", + "norm", + "p_norm", + "pad3d", + "pow", + "prelu", + "reduce_mean", + "unsqueeze", + "unsqueeze2", + "logical_and", + "logical_not", + "meshgrid", + "roi_align", + "strided_slice", + "where", + "grid_sampler", + "tile", + "group_norm", + "reduce_sum", + "square", + "softplus", + "shuffle_channel", ] # To be compatible with PaddleSlim, not remove _activation_type for now diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py index 3fadf25150f9ef3556a343fdce8acc24d788f5dc..f97c2778c0918ecbfbed546089c17e9d505818cd 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py @@ -52,6 +52,30 @@ def parse_args(): '--debug', action='store_true', help='If used, the graph of Quant model is drawn.') + parser.add_argument( + '--quant_model_filename', + type=str, + default="", + help='The input model`s file name. If empty, search default `__model__` and separate parameter files and use them or in case if not found, attempt loading `model` and `params` files.' + ) + parser.add_argument( + '--quant_params_filename', + type=str, + default="", + help='If quant_model_filename is empty, this field is ignored. The input model`s all parameters file name. If empty load parameters from separate files.' + ) + parser.add_argument( + '--save_model_filename', + type=str, + default="__model__", + help='The name of file to save the inference program itself. If is set None, a default filename __model__ will be used.' + ) + parser.add_argument( + '--save_params_filename', + type=str, + default=None, + help='The name of file to save all related parameters. If it is set None, parameters will be saved in separate files' + ) test_args, args = parser.parse_known_args(namespace=unittest) return test_args, sys.argv[:1] + args @@ -61,18 +85,29 @@ def transform_and_save_int8_model(original_path, save_path, ops_to_quantize='', op_ids_to_skip='', - debug=False): + debug=False, + quant_model_filename='', + quant_params_filename='', + save_model_filename='', + save_params_filename=''): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_scope = fluid.executor.global_scope() with fluid.scope_guard(inference_scope): - if os.path.exists(os.path.join(original_path, '__model__')): - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe) + if not quant_model_filename: + if os.path.exists(os.path.join(original_path, '__model__')): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(original_path, + exe) + else: + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, 'model', 'params') else: [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe, - 'model', 'params') + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, quant_model_filename, + quant_params_filename) ops_to_quantize_set = set() print(ops_to_quantize) @@ -97,8 +132,14 @@ def transform_and_save_int8_model(original_path, graph = transform_to_mkldnn_int8_pass.apply(graph) inference_program = graph.to_program() with fluid.scope_guard(inference_scope): - fluid.io.save_inference_model(save_path, feed_target_names, - fetch_targets, exe, inference_program) + fluid.io.save_inference_model( + save_path, + feed_target_names, + fetch_targets, + exe, + inference_program, + model_filename=save_model_filename, + params_filename=save_params_filename) print( "Success! INT8 model obtained from the Quant model can be found at {}\n" .format(save_path)) @@ -109,4 +150,6 @@ if __name__ == '__main__': test_args, remaining_args = parse_args() transform_and_save_int8_model( test_args.quant_model_path, test_args.int8_model_save_path, - test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug) + test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug, + test_args.quant_model_filename, test_args.quant_params_filename, + test_args.save_model_filename, test_args.save_params_filename) diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py index 9bf45f4272738c69073d252371b6a6c59aaf15da..ec288a1287119dd436a91843b863ba355bba28fb 100644 --- a/python/paddle/fluid/contrib/sparsity/__init__.py +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -29,10 +29,11 @@ from .asp import decorate from .asp import prune_model from .asp import set_excluded_layers from .asp import reset_excluded_layers +from .supported_layer_list import add_supported_layer __all__ = [ 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers', - 'reset_excluded_layers' + 'reset_excluded_layers', 'add_supported_layer' ] diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index ffa12ac70460084fd49a14d0193be6e913495b9a..30439ad736d26f3086a7f87d591aa68a59b7baa8 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -23,6 +23,8 @@ import paddle from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map +from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning from paddle.fluid import core OpRole = core.op_proto_and_checker_maker.OpRole @@ -292,8 +294,8 @@ class ASPHelper(object): 2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning. """ - MASK_APPENDDED_NAME = '_asp_mask' - SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'} + MASK_APPENDDED_NAME = 'asp_mask' + PADDLE_WEIGHT_SUFFIX = "w_" __asp_info = {} @@ -334,7 +336,6 @@ class ASPHelper(object): r""" This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. """ - checked_func_name = sparsity.CheckMethod.get_checking_method(mask_algo) if main_program is None: main_program = paddle.static.default_main_program() @@ -345,33 +346,27 @@ class ASPHelper(object): weight_tensor = global_scope().find_var(param.name).get_tensor() weight_nparray = np.array(weight_tensor) - # The double transpose ops here make sure pruning direction consistent with cuSparseLt. - # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. - # cuSparseLt would prune matrix A along k dimension. - # In sparse training, layer weight matriices is viewed sparse matrix A, so - # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle - # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed - # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension - # of W^T, which is m dimension of W. Moreove, all mask generating functions in - # sparsity/utils is row-major pruning. That is the reason we have to transpose weight - # matrices beforce invoking create_mask. Then we transpose the result maks to make - # sure its shape to be the same as the input weight. - weight_sparse_mask = sparsity.create_mask( - weight_nparray.T, func_name=mask_algo, n=n, m=m).T - weight_pruned_nparray = np.multiply(weight_nparray, - weight_sparse_mask) + prune_func = ASPHelper._get_prune_func_by_name(param.name) + + weight_pruned_nparray, weight_sparse_mask = \ + prune_func(weight_nparray, m, n, mask_algo, param.name) + weight_pruned_nparray = weight_pruned_nparray.astype( + weight_nparray.dtype) weight_tensor.set(weight_pruned_nparray, place) - assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ - 'Pruning {} weight matrix failure!!!'.format(param.name) + if with_mask: weight_mask_param = global_scope().find_var( ASPHelper._get_mask_name(param.name)) assert weight_mask_param is not None, \ - 'Cannot find {} variable, please call ASPHelper.minimize' \ + 'Cannot find {} variable, please call optimizer.minimize (' \ + 'paddle.sparsity.decorate(optimizer).minimize(loss)' \ ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name)) weight_mask_tensor = weight_mask_param.get_tensor() + weight_sparse_mask = weight_sparse_mask.astype( + np.array(weight_mask_tensor).dtype) weight_mask_tensor.set(weight_sparse_mask, place) asp_info.update_masks(param.name, weight_sparse_mask) + return asp_info.masks.copy() @staticmethod @@ -384,7 +379,7 @@ class ASPHelper(object): Returns: string: The mask name of :attr:`param_name`. """ - return param_name + ASPHelper.MASK_APPENDDED_NAME + return param_name + "." + ASPHelper.MASK_APPENDDED_NAME @staticmethod def _get_not_ASP_relevant_vars(main_program): @@ -434,19 +429,46 @@ class ASPHelper(object): # fc_0.w_0 -> True # fc_0.b_0 -> False """ - if ASPHelper.MASK_APPENDDED_NAME in param_name: + param_name_list = param_name.split('.') + + if ASPHelper.MASK_APPENDDED_NAME in param_name_list: return False for layer in cls._get_program_asp_info(main_program).excluded_layers: if layer in param_name: return False - for name in ASPHelper.SUPPORTED_LAYERS: - if name in param_name and \ - ASPHelper.SUPPORTED_LAYERS[name] in param_name: - return True + if param_name in supported_layers_and_prune_func_map: + return True + + param_name_no_weight_suffix = param_name_list[0] + param_type_suffix = param_name_list[1] + layer_name = param_name_no_weight_suffix[:param_name_no_weight_suffix. + rfind('_')] + if ASPHelper.PADDLE_WEIGHT_SUFFIX not in param_type_suffix: + return False + + if param_name_no_weight_suffix in supported_layers_and_prune_func_map or \ + layer_name in supported_layers_and_prune_func_map: + return True + return False + @classmethod + def _get_prune_func_by_name(cls, param_name): + func = supported_layers_and_prune_func_map.get(param_name, None) + param_name_no_weight_suffix = param_name.split('.')[0] + if func is None: + func = supported_layers_and_prune_func_map.get( + param_name_no_weight_suffix, None) + if func is None: + layer_name = param_name_no_weight_suffix[: + param_name_no_weight_suffix. + rfind('_')] + func = supported_layers_and_prune_func_map.get(layer_name, + _default_pruning) + return func + @classmethod def _minimize(cls, optimizer, @@ -509,8 +531,7 @@ class ASPHelper(object): if ASPHelper._is_supported_layer(main_program, param_and_grad[0].name): mask_param = layers.create_parameter( - name=param_and_grad[0].name + - ASPHelper.MASK_APPENDDED_NAME, + name=ASPHelper._get_mask_name(param_and_grad[0].name), shape=param_and_grad[0].shape, dtype=param_and_grad[0].dtype, default_initializer=ConstantInitializer(value=1.0)) diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py new file mode 100644 index 0000000000000000000000000000000000000000..105c2ded9eee71f68344d99dba325fee0a155850 --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle.fluid.contrib import sparsity +import threading + +__all__ = ['add_supported_layer'] + + +def _default_pruning(weight_nparray, m, n, func_name, param_name): + + checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) + + # The double transpose ops here make sure pruning direction consistent with cuSparseLt. + # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. + # cuSparseLt would prune matrix A along k dimension. + # In sparse training, layer weight matrices is viewed sparse matrix A, so + # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle + # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed + # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension + # of W^T, which is m dimension of W. Moreove, all mask generating functions in + # sparsity/utils is row-major pruning. That is the reason we have to transpose weight + # matrices beforce invoking create_mask. Then we transpose the result mask to make + # sure its shape to be the same as the input weight. + weight_sparse_mask = sparsity.create_mask( + weight_nparray.T, func_name=func_name, n=n, m=m).T + weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask) + assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ + 'Pruning {} weight matrix failure!!!'.format(param_name) + return weight_pruned_nparray, weight_sparse_mask + + +# When value of given key in this DICT is None, +# ASP will call default pruning function in pruning stage. +_supported_layers_and_prune_func_map_lock = threading.Lock() +supported_layers_and_prune_func_map = {} + + +def add_supported_layer(layer, pruning_func=None): + r""" + Add supported layers and its corresponding pruning function. + + Args: + name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then + it would be turn to string internally. ASP would use this name to match parameter's name and call + its the corresponding pruning function. + pruning_func (function, optional): a function type which receives five argument (weight_nparray, + m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, + m, n, and func_name, please see `prune_model` for details. + """ + name = None + if isinstance(layer, str): + name = layer + elif isinstance(layer, paddle.fluid.dygraph.layers.Layer): + name = paddle.fluid.dygraph.layers._convert_camel_to_snake( + type(layer).__name__) + elif issubclass(layer, paddle.fluid.dygraph.layers.Layer): + name = paddle.fluid.dygraph.layers._convert_camel_to_snake( + layer.__name__) + else: + assert "The type of layer should be string of Layer, but got {}!".format( + type(layer)) + if pruning_func is None: + pruning_func = _default_pruning + _supported_layers_and_prune_func_map_lock.acquire() + supported_layers_and_prune_func_map.update({name: pruning_func}) + _supported_layers_and_prune_func_map_lock.release() + + +add_supported_layer('fc') +add_supported_layer('linear') +add_supported_layer('conv2d') diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 706ec0d523b938fda0501dfd04f1fc976bf6a26b..5385ac28b90f614fcd6003994b9a7000bc16702a 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -564,6 +564,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._rcvd_idx += 1 self._batches_outstanding -= 1 else: + # NOTE: when _rcvd_idx catch up _send_idx, which means + # one of following: + # 1. all 2 * num_workers batches have been loaded + # and stored in _blocking_queue + # 2. all data drained + # we need to let _thread blocking at _data_queue + # get_data to inoccupy CPU, otherwise may occupy + # CPU time for model running # NOTE: in persistent workers mode, do not check data # drained here, simply let it go to _data_queue # reading to get _ResumeIteration @@ -573,7 +581,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): # may also be data in blocking queue if self._batches_outstanding < len(self._places): return None - continue if self._rcvd_idx in self._task_infos and \ len(self._task_infos[self._rcvd_idx]) == 3: diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 191661b7bf9d5a5b2877f22ebe6d2ec9124f2f96..4127f1e4449bf82aae294ce952122f1f8f6e775f 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -271,18 +271,28 @@ def amp_guard(enable=True, "current_tracer is None, maybe it is not in imperative mode.") # check device_type: - # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16. + # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, mlu for float16, npu for float16. # Maybe we will support cpu for bfloat16. if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_mlu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False + # For npu: + if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'): + warnings.warn('NPUPlace only support float16 amp.') + enable = False # For xpu: if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): warnings.warn('XPUPlace only support float16 amp.') enable = False + # For mlu: + if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'): + warnings.warn('MLUPlace only support float16 amp.') + enable = False # For gpu float16: Compute Capability should >= 7. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. if tracer._expected_place.is_gpu_place(): diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index f7c2d6be574c4e17fa5ce6fa44ca4ecc55a5eb95..c57290861942b8020f6f55792c445d42a0578c90 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -105,9 +105,11 @@ class AmpScaler(object): "current_tracer is None, maybe it is not in imperative mode.") if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_mlu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False @@ -286,14 +288,28 @@ class AmpScaler(object): ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 ) ] - if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - self._temp_found_inf_fp16) - if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - self._temp_found_inf_fp32) + if core.is_compiled_with_npu(): + float_status = _C_ops.alloc_float_status() + _C_ops.clear_float_status(float_status, float_status) + + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + float_status, param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + float_status, param_grads_fp32, + self._temp_found_inf_fp32) + else: + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + self._temp_found_inf_fp32) + if len(param_grads_fp16) and len(param_grads_fp32): self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32 elif len(param_grads_fp16): diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index d0552ca41f0daf56ce23317dd06cb5744baaff84..d8b1883fc62a0fb4575a2e525d7d37a9029cf40d 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -35,6 +35,12 @@ final_state_name_mapping = { "x": "X", "out": "Out", }, + "pool2d": { + "final_op_name": "final_state_pool2d", + "x": "X", + "kernel_size": "ksize", + "out": "Out", + }, "abs": { "final_op_name": "final_state_abs", "x": "X", diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fd7226c48661fdb2cd4dcf7227d0f8383c6c9439..000f08b0a3e282d815c758b5a153ba53ff84c8e0 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6299,7 +6299,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): if dim_size == -1: assert unk_dim_idx == -1, ( "Only one dimension value of 'shape' in reshape can " - "be -1. But received shape[%d] is also -1." % dim_idx) + "be -1. But received shape[%d] is also -1.\n" + "\n\t# N = x.shape()[2]\t\t# N is an int. " + "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t" + "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])" + "\t# z.shape is [-1, -1, 4]\n\n" + " If your target shape in Reshape represents dynamic shape, " + "please turn it into a Tensor under @to_static. See above example for details." + % dim_idx) unk_dim_idx = dim_idx elif dim_size == 0: assert dim_idx < len(x.shape), ( diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b499a9e01c36eefcb9b6cb91956abc5ee0a99b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map +from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake + + +class MyOwnLayer(Layer): + def __init__(self): + super(MyOwnLayer, self).__init__() + + def forward(self, x): + return x + + +static_tensor = None +static_tensor_mask = None + + +def my_own_pruning(tensor, m, n, mask_algo, param_name): + global static_tensor + global static_tensor_mask + if static_tensor is None: + static_tensor = np.random.rand(*tensor.shape).astype(np.float32) + if static_tensor_mask is None: + static_tensor_mask = np.random.rand(*tensor.shape).astype(np.float32) + return static_tensor, static_tensor_mask + + +class TestASPAddSupportedLayer(unittest.TestCase): + def test_add_supported_layer_via_name(self): + sparsity.add_supported_layer("test_supported_1") + sparsity.add_supported_layer("test_supported_2", my_own_pruning) + sparsity.add_supported_layer(MyOwnLayer) + my_own_layer_name = _convert_camel_to_snake(MyOwnLayer.__name__) + + self.assertTrue( + "test_supported_1" in supported_layers_and_prune_func_map) + self.assertTrue( + "test_supported_2" in supported_layers_and_prune_func_map) + self.assertTrue( + "test_supported_2" in supported_layers_and_prune_func_map) + self.assertTrue(supported_layers_and_prune_func_map["test_supported_2"] + == my_own_pruning) + self.assertTrue( + my_own_layer_name in supported_layers_and_prune_func_map) + + +class TestASPStaticCustomerizedPruneFunc(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + self.customer_prefix = "customer_layer" + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, + size=32, + act='relu', + name=self.customer_prefix) + hidden = fluid.layers.fc(input=hidden, + size=32, + act='relu', + name=self.customer_prefix) + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + self.supported_layer_count_ref = 5 + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self.exe = fluid.Executor(self.place) + + sparsity.add_supported_layer(self.customer_prefix, my_own_pruning) + + def test_inference_pruning(self): + self.exe.run(self.startup_program) + + sparsity.prune_model( + self.main_program, mask_algo="mask_1d", with_mask=False) + + supported_layer_count = 0 + for param in self.main_program.global_block().all_parameters(): + mat = np.array(fluid.global_scope().find_var(param.name).get_tensor( + )) + if sparsity.asp.ASPHelper._is_supported_layer(self.main_program, + param.name): + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + def test_training_pruning(self): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = sparsity.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + self.exe.run(self.startup_program) + + sparsity.prune_model( + self.main_program, mask_algo="mask_1d", with_mask=True) + + supported_layer_count = 0 + for param in self.main_program.global_block().all_parameters(): + mat = np.array(fluid.global_scope().find_var(param.name).get_tensor( + )) + if sparsity.asp.ASPHelper._is_supported_layer(self.main_program, + param.name): + mat_mask = np.array(fluid.global_scope().find_var( + sparsity.asp.ASPHelper._get_mask_name(param.name)) + .get_tensor()) + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + self.assertLessEqual( + np.sum(mat_mask.flatten() - static_tensor_mask.flatten( + )), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertTrue( + sparsity.check_sparsity( + mat_mask.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 1f7ae53acdf4536921ca25e27874279f271b4de8..a730d21afa57980538841a3ad7fe874fd2343d4a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -9,6 +9,12 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + + py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS}) + py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) + py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) + py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py new file mode 100644 index 0000000000000000000000000000000000000000..ab704a6a25714ef2fb935d6e3e776105aa4142cc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py @@ -0,0 +1,152 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +from paddle.distributed.auto_parallel.tuner import recorder as rd + + +class TestRecorder(unittest.TestCase): + def test_register(self): + recorder = rd.MetricsRecorder() + recorder.register("metric") + self.assertEqual(set(recorder.records.keys()), {"metric"}) + self.assertEqual(recorder.records["metric"].direction, "min") + + def test_exists(self): + recorder = rd.MetricsRecorder() + recorder.register("metric", direction="max") + self.assertTrue(recorder.exists("metric")) + + def test_update(self): + recorder = rd.MetricsRecorder() + recorder.update("metric", 4, 1000) + self.assertEqual(recorder.records["metric"].direction, "min") + self.assertEqual( + recorder.get_records("metric"), [rd.MetricRecord(4, 1000)]) + + def test_get_records(self): + recorder = rd.MetricsRecorder() + recorder.update("metric", 1, step=0) + recorder.update("metric", 2, step=1) + recorder.update("metric", 3, step=2) + recorder.update("metric", 4, step=3) + self.assertEqual( + recorder.get_records("metric"), [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ]) + + def test_set_records(self): + recorder = rd.MetricsRecorder() + recorder.set_records( + "metric", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual( + recorder.get_records("metric"), [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ]) + + def test_get_best_value(self): + recorder = rd.MetricsRecorder() + recorder.register("metric_min", "min") + recorder.register("metric_max", "max") + + recorder.set_records( + "metric_min", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_value("metric_min"), 1) + + recorder.set_records( + "metric_max", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_value("metric_max"), 4) + + def test_get_best_step(self): + recorder = rd.MetricsRecorder() + + recorder.register("metric_min", "min") + recorder.set_records( + "metric_min", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_step("metric_min"), 0) + + recorder.register("metric_max", "max") + recorder.set_records( + "metric_max", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_step("metric_max"), 3) + + def test_get_statistics(self): + recorder = rd.MetricsRecorder() + records = [rd.MetricRecord(np.random.random(), i) for i in range(14)] + recorder.set_records("metric", records) + stats = recorder.get_statistics("metric") + records = [r.value for r in records] + self.assertEqual(stats["min"], np.min(records)) + self.assertEqual(stats["max"], np.max(records)) + self.assertEqual(stats["mean"], np.mean(records)) + self.assertEqual(stats["median"], np.median(records)) + self.assertEqual(stats["var"], np.var(records)) + self.assertEqual(stats["std"], np.std(records)) + + def test_serialization(self): + recorder = rd.MetricsRecorder() + recorder.register("metric") + recorder.set_records( + "metric", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + print(recorder.get_state()) + new_recorder = rd.MetricsRecorder.from_state(recorder.get_state()) + self.assertEqual(new_recorder.records.keys(), recorder.records.keys()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py new file mode 100644 index 0000000000000000000000000000000000000000..fc52d1c394effc223a609ae5db73ea89a25c298b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_space as ts +from paddle.distributed.auto_parallel.tuner import trial as tr + + +class TestTiral(unittest.TestCase): + def test_trial(self): + space = ts.TunableSpace() + space.choice("choice", [0, 1, 2, 3], default=2) + trial = tr.Trial(space, trial_id="trial-1") + trial.recorder.register("latency", direction="min") + trial.recorder.update("latency", 0.1, step=0) + trial.recorder.update("latency", 0.2, step=1) + trial.best_step = 0 + + self.assertEqual(trial.id, "trial-1") + self.assertEqual(trial.space.get_value("choice"), 2) + self.assertEqual(trial.best_step, 0) + self.assertEqual(trial.status, "RUNNING") + + def test_serialization(self): + space = ts.TunableSpace() + space.int_range("int_range", start=1, stop=4, default=2) + trial = tr.Trial(space, trial_id="trial-2", status="COMPLETED") + trial.recorder.register("latency", direction="min") + trial.recorder.update("latency", 0.1, step=0) + trial.recorder.update("latency", 0.2, step=1) + trial.best_step = 0 + + new_trial = tr.Trial.from_state(trial.get_state()) + self.assertEqual(new_trial.id, "trial-2") + self.assertEqual(new_trial.space.get_value("int_range"), 2) + self.assertEqual(new_trial.best_step, 0) + self.assertEqual(new_trial.status, "COMPLETED") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py new file mode 100644 index 0000000000000000000000000000000000000000..1179fd9a9f0887f5133349118eb5b4c8fbab733d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -0,0 +1,209 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +import paddle.nn as nn +import paddle.utils as utils +import paddle.static as static +import paddle.nn.functional as F +import paddle.distributed.auto_parallel as auto + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.completion import Completer +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.utils import make_data_unshard +from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context +from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [[0, 1], [2, 3]] + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal( + mean=0.0, std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + auto.shard_tensor( + self.linear0.weight, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, 0] + }) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _g_process_mesh[1], + "dims_mapping": [0, -1] + }) + out = self.linear1(out) + + return out + + +def loop_cond(i, loop_len, input_array): + return i < loop_len + + +def loop_body(i, loop_len, input_array): + pre_input = paddle.tensor.array_read(array=input_array, i=i) + mlp_while0 = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + mlp_while1 = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + output = mlp_while0(pre_input) + cur_pred = mlp_while1(output) + # 更新循环条件 + i = paddle.increment(x=i, value=1) + paddle.tensor.array_write(cur_pred, array=input_array, i=i) + return i, loop_len, input_array + + +def get_program(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + with static.program_guard(train_program, start_program): + + # 循环计数器 + i = paddle.full(shape=[1], fill_value=0, dtype='int64') + # 循环次数 + loop_len = paddle.full(shape=[1], fill_value=epoch_num, dtype='int64') + + # input + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] + # dataloader + dataloader = paddle.io.DataLoader.from_generator( + feed_list=data_holder, capacity=4 * batch_size, iterable=False) + dataloader.set_batch_generator( + batch_generator_creator(), places=paddle.static.cuda_places()) + # data dist_attr + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, -1, -1] + }) + auto.shard_tensor( + label, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, -1, -1] + }) + + mlp_start = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_start(input) + + input_array = paddle.tensor.array_write(pred, i) + i, loop_len, input_array = static.nn.while_loop( + cond=loop_cond, + body=loop_body, + loop_vars=[i, loop_len, input_array]) + end_pred = paddle.tensor.array_read(array=input_array, i=i) + + mlp_end = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_end(end_pred) + + error_cost = paddle.nn.functional.square_error_cost(pred, label) + loss = paddle.mean(error_cost) + + return train_program, start_program, dataloader, i, loss + + +class TestMLP(unittest.TestCase): + def test_completer(self): + train_program, start_program, dataloader, i, loss = get_program() + dist_context = DistributedContext() + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) + # print_program_with_dist_attr(complete_train_program, dist_context) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py index 66c547de2c28068a9605c506416baabdf229265d..2e84607e2f5c24bde4b3d291cc1b7f33f89c5751 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py @@ -25,17 +25,120 @@ from hypothesis import given, settings, seed, example, assume import hypothesis.strategies as st -class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): +# the two inputs of elementwise_add are tensor +class TestConvElementwiseAddMkldnnFusePass1(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! - if attrs[1]['data_format'] == "NHWC": + if attrs[1]['data_format'] == "NHWC" and attrs[3]['axis'] == 0: + return False + if attrs[1]['data_format'] == "NCHW" and attrs[3]['axis'] == -1: return False + return True + + def sample_program_config(self, draw): + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.sampled_from([1, 2, 4])) + paddings = draw(st.sampled_from([[0, 3], [1, 1], [1, 2, 3, 4]])) + strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + axis = draw(st.sampled_from([-1, 0])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + if data_format == "NCHW": + return np.random.random( + [batch_size, 48, 64, 64]).astype(np.float32) + else: + return np.random.random( + [batch_size, 64, 64, 48]).astype(np.float32) + + def generate_weight(): + return np.random.random( + [48, int(48 / groups), 3, 3]).astype(np.float32) + + relu_op = OpConfig( + type="relu", + inputs={"X": ["input_data"]}, + outputs={"Out": ["relu_out"]}, + attrs={}) + + conv2d_op1 = OpConfig( + type="conv2d", + inputs={"Input": ["relu_out"], + "Filter": ["conv_weight1"]}, + outputs={"Output": ["conv_output1"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + conv2d_op2 = OpConfig( + type="conv2d", + inputs={"Input": ["input_data"], + "Filter": ["conv_weight2"]}, + outputs={"Output": ["conv_output2"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + elt_op = OpConfig( + type="elementwise_add", + inputs={"X": ["conv_output1"], + "Y": ["conv_output2"]}, + outputs={"Out": ["elementwise_output"]}, + attrs={'axis': axis}) + model_net = [relu_op, conv2d_op1, conv2d_op2, elt_op] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "conv_weight1": TensorConfig(data_gen=partial(generate_weight)), + "conv_weight2": TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["elementwise_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["relu", "conv2d", "conv2d"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) + + +''' +class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + if "elementwise_weight" in program_config.weights: + if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[1]: + if attrs[2]['axis'] != 1: + return False + if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[3]: + if attrs[2]['axis'] != -1: + return False return True def sample_program_config(self, draw): @@ -101,7 +204,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): "strides": strides }) - if axis == -1 or axis == 0: + if axis == 0: elt_op = OpConfig( type="elementwise_add", inputs={"X": ["input_data1"], @@ -118,14 +221,12 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): model_net = [relu_op, conv2d_op, elt_op] - if axis == 1: + if axis == 0: program_config = ProgramConfig( ops=model_net, weights={ "conv_weight": - TensorConfig(data_gen=partial(generate_weight1)), - "elementwise_weight": - TensorConfig(data_gen=partial(generate_weight2)) + TensorConfig(data_gen=partial(generate_weight1)) }, inputs={ "input_data1": @@ -137,7 +238,9 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): ops=model_net, weights={ "conv_weight": - TensorConfig(data_gen=partial(generate_weight1)) + TensorConfig(data_gen=partial(generate_weight1)), + "elementwise_weight": + TensorConfig(data_gen=partial(generate_weight2)) }, inputs={ "input_data1": @@ -154,7 +257,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): def test(self): self.run_and_statis( quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) - +''' if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 457f20ac5b06be0afb6929dc74148eb42b3ce4db..530ea2838a76fcf01a3047be56f46dea0232619e 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -715,10 +715,11 @@ class OpTest(unittest.TestCase): assert related_idx >= 0, "%d-th arguments don't have default value" % idx return defaults[related_idx] - def remove_name(x): - if isinstance(x, list): return [i for i in x if i != 'name'] + def filter_by_name(x): + names = set(['name', 'out', 'output']) + if isinstance(x, list): return [i for i in x if i not in names] if isinstance(x, dict): - return {k: v for k, v in x.items() if k != 'name'} + return {k: v for k, v in x.items() if k not in names} assert False, "Only support list or dict." def to_defaults_list(params, defaults): @@ -728,7 +729,7 @@ class OpTest(unittest.TestCase): # Because we don't know the python api name of each arguments. # using parse_arg_and_kwargs, we can get the all api information we need. api_params, api_defaults = [ - remove_name(item) for item in parse_arg_and_kwargs(api) + filter_by_name(item) for item in parse_arg_and_kwargs(api) ] api_defaults = to_defaults_list(api_params, api_defaults) inputs_sig, attrs_sig, outputs_sig = kernel_sig @@ -784,10 +785,10 @@ class OpTest(unittest.TestCase): block = fluid.default_main_program().global_block() op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) # prepare input variable - inputs = self.append_input_output_for_dygraph(op_proto, self.inputs, - True, False, block) + eager_tensor_inputs = self.append_input_output_for_dygraph( + op_proto, self.inputs, True, False, block) # prepare output variable - outputs = self.append_input_output_for_dygraph( + eager_tensor_outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) # prepare attrbutes @@ -798,13 +799,14 @@ class OpTest(unittest.TestCase): attrs_outputs[attrs_name] = self.attrs[attrs_name] kernel_sig = _dygraph_tracer()._get_kernel_signature( - self.op_type, inputs, outputs, attrs_outputs) + self.op_type, eager_tensor_inputs, eager_tensor_outputs, + attrs_outputs) assert hasattr( self, "python_api" ), "Please set the `self.python_api` if you want to compare python api output." - args = prepare_python_api_arguments(self.python_api, inputs, - attrs_outputs, kernel_sig) + args = prepare_python_api_arguments( + self.python_api, eager_tensor_inputs, attrs_outputs, kernel_sig) """ we directly return the cal_python_api value because the value is already tensor. """ return cal_python_api(self.python_api, args, kernel_sig) @@ -1286,11 +1288,11 @@ class OpTest(unittest.TestCase): with _test_eager_guard(): eager_dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) - # we only check end2end api when check_eager=True - if hasattr(self, "python_api"): - api_outs = self._calc_python_api_output(place) - self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, - place) + # we only check end2end api when check_eager=True + if hasattr(self, "python_api"): + api_outs = self._calc_python_api_output(place) + self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, + place) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 50ea065209422d2c972e480fbbd9a9442b5e5c25..6c964a828eed7eb01bce68b81baab61c66c5cf43 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -123,17 +123,26 @@ class XPUOpTest(OpTest): return super().check_grad_with_place( place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, in_place, max_relative_error, - user_defined_grads, user_defined_grads, check_dygraph) + user_defined_grads, user_defined_grad_outputs, check_dygraph) a1 = self.get_grad_with_place( - place, inputs_to_check, output_names, no_grad_set=no_grad_set) + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) a2 = self.get_grad_with_place( - place, inputs_to_check, output_names, no_grad_set=no_grad_set) + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) a3 = self.get_grad_with_place( paddle.CPUPlace(), inputs_to_check, output_names, - no_grad_set=no_grad_set) + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) self._assert_is_close(a1, a2, inputs_to_check, 0.00000001, "Gradient Check On two xpu") self._assert_is_close(a1, a3, inputs_to_check, max_relative_error, @@ -147,7 +156,7 @@ class XPUOpTest(OpTest): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None, + user_defined_grad_outputs=None, check_dygraph=True): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() @@ -197,6 +206,10 @@ class XPUOpTest(OpTest): if not type(output_names) is list: output_names = [output_names] - analytic_grads = self._get_gradient(inputs_to_check, place, - output_names, no_grad_set) + analytic_grads = self._get_gradient( + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) return analytic_grads diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py index 13e763bee630517d1caaa5432f86882c37aab449..43b5ce96a390150db7e29588e4107271b240b23f 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum.py +++ b/python/paddle/fluid/tests/unittests/test_einsum.py @@ -26,14 +26,14 @@ class TestErrors(unittest.TestCase): def test_diagonalize_errors(self): a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') a = paddle.to_tensor(a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('...ii->...i', a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('i...i', a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('i...i->i...', a) def test_param_errors(self): @@ -396,6 +396,51 @@ class TestNumpyTests(unittest.TestCase): self.check_output('a...b,b...c,c...a', a, a, a) self.check_output('...ab,...ba,...ab,...ab', a, a, a, a) + def test_static_graph(self): + paddle.enable_static() + fluid = paddle.fluid + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + a = paddle.static.data( + name='a', shape=[3, None, None, None], dtype='float') + b = paddle.static.data( + name='b', shape=[2, None, None, None], dtype='float') + c = paddle.static.data( + name='c', shape=[None, None, 2, None], dtype='float') + d = paddle.static.data( + name='d', shape=[None, None, 5], dtype='float') + e = paddle.static.data( + name='e', shape=[None, 2, None], dtype='float') + + outs = [] + outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b)) + outs.append(paddle.einsum('...ik, ...j', c, d)) + outs.append(paddle.einsum('...kj, ...ik', d, e)) + outs.append(paddle.einsum('ijk..., ikj', c, e)) + outs.append(paddle.einsum('ijk..., ikj->...ij', c, e)) + exe = fluid.Executor(self.place) + exe.run(startup) + a = np.arange(72).reshape(3, 2, 3, 4).astype('float') + b = np.arange(48).reshape(2, 2, 3, 4).astype('float') + c = np.arange(48).reshape(2, 3, 2, 4).astype('float') + d = np.arange(30).reshape(2, 3, 5).astype('float') + e = np.arange(12).reshape(2, 2, 3).astype('float') + feeds = {'a': a, 'b': b, 'c': c, 'd': d, 'e': e} + actual = exe.run(main, feed=feeds, fetch_list=[outs]) + expect = [] + expect.append(np.einsum("ibnd,jbnd->bnij", a, b)) + expect.append(np.einsum('...ik, ...j', c, d)) + expect.append(np.einsum('...kj, ...ik', d, e)) + expect.append(np.einsum('ijk..., ikj', c, e)) + expect.append(np.einsum('ijk..., ikj->...ij', c, e)) + for a, e in zip(actual, expect): + self.check_output_equal(a, e) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d1d391a3949ead28697c0756803e873c41914079..318e826058f2c111f825b113c8ee4676ff87d630 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 00967cb503fe5fd677839a869798964bb5fb0b71..b35b2840ed30a2650e6e19a4cfbc381f50fd5024 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -23,7 +23,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, compiler, program_guard from paddle.fluid.op import Operator -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseMulOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index d2bffbe074f2a9bf63975831560597067508aaf5..0ae005430e03b046d609c393fcc0641a0d3db49e 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -213,9 +213,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): set(parameters), set([ 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0', - 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask', - 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask', - 'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0', + 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0.asp_mask', + 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0.asp_mask', + 'fc_0.w_0.asp_mask', 'fc_1.b_0_velocity_0', 'fc_2.b_0_velocity_0' ])) self.assertEqual(ops, [ diff --git a/python/paddle/fluid/tests/unittests/test_number_count_op.py b/python/paddle/fluid/tests/unittests/test_number_count_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0df9d2a3a41b44c18b7e008a271c10544ec4dfa0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_number_count_op.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import op_test +import numpy as np +import unittest +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.models.moe import utils + + +def count(x, upper_range): + res = np.zeros((upper_range, )).astype(int) + for i in x.reshape(-1): + if i >= 0 and i < len(res): + res[i] += 1 + return res + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestExpertCountOpInt64(op_test.OpTest): + def setUp(self): + expert_num = 16 + self.op_type = "number_count" + x = np.random.randint(-1, expert_num, size=(1000, 2)).astype('int64') + self.inputs = {'gate_idx': x} + self.outputs = {'Out': count(x, expert_num)} + self.attrs = {"upper_range": expert_num} + + def test_forward(self): + self.check_output_with_place(paddle.CUDAPlace(0)) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestExpertCountAPI(unittest.TestCase): + def setUp(self): + self.upper_range = 320 + self.x = np.random.randint( + -1, self.upper_range, size=(6000, 200)).astype('int64') + self.out = count(self.x, self.upper_range) + self.place = paddle.CUDAPlace(0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('x', self.x.shape, dtype="int64") + out = utils._number_count(x, self.upper_range) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'x': self.x}, fetch_list=[out]) + assert np.allclose(res, self.out) + + def test_api_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x) + out = utils._number_count(x, self.upper_range) + assert np.allclose(out.numpy(), self.out) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index 4361a45f1568f5f047ee03090bd3ef28a8d6654f..2380ccb14aaeede7474c99ccbd2d5805849c2118 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -18,6 +18,7 @@ from op_test import OpTest import paddle import paddle.fluid.core as core from paddle.static import program_guard, Program +import os def check_randperm_out(n, data_np): @@ -129,5 +130,81 @@ class TestRandpermImperative(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + + x = paddle.randperm(30000, dtype='int32').numpy() + expect = [ + 24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343 + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='int64').numpy() + expect = [ + 6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float32').numpy() + expect = [ + 5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144., + 22906., 10705. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564., + 26991. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339., + 4662. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float64').numpy() + expect = [ + 19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147., + 9163. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945., + 17624. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202., + 21404. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index a3bfe3864a2493fdcf100a1a86648a159701ec11..beaf361379b94dd28997a6186a58608694a20eca 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -333,7 +333,8 @@ class TestVariable(unittest.TestCase): with self.assertRaises(IndexError): res = x[[True, False, False]] with self.assertRaises(ValueError): - res = x[[False, False]] + with paddle.static.program_guard(prog): + res = x[[False, False]] def test_slice(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index 69bca8dd9ef15459021f44fd1b4887e636516ec6..66f2e871dac462c8e6e47357e7367755d2fc0cfc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -849,6 +849,38 @@ def ref_softsign(x): return out +class XPUTestSoftshrinkOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softshrink' + self.use_dynamic_create_class = False + + class XPUTestSoftshrink(TestActivationOPBase): + def set_case(self): + self.op_type = "softshrink" + self.dtype = self.in_type + + threshold = 0.5 + np.random.seed(1023) + x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype) + out = ref_softshrink(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('softshrink') +for stype in support_types: + create_test_class(globals(), XPUTestSoftshrinkOP, stype) + + +def ref_softshrink(x, threshold=0.5): + out = np.copy(x) + out = (out < -threshold) * (out + threshold) + (out > threshold) * ( + out - threshold) + return out + + class XPUTestSwishOP(XPUOpTestWrapper): def __init__(self): self.op_name = 'swish' @@ -879,5 +911,36 @@ def ref_swish(x): return out +class XPUTestThresholdedReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'thresholded_relu' + self.use_dynamic_create_class = False + + class XPUTestThresholdedRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "thresholded_relu" + self.dtype = self.in_type + + threshold = 1.0 + np.random.seed(1024) + x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_thresholded_relu(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('thresholded_relu') +for stype in support_types: + create_test_class(globals(), XPUTestThresholdedReluOP, stype) + + +def ref_thresholded_relu(x, threshold=1.0): + out = (x > threshold) * x + return out + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index 9f577d5ff38024fe9264deec2980ff091996a1d8..2d0b079ee9280e2dd0cc0e62c6c5932565ba9dfd 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -29,7 +29,7 @@ def segment_sum(data, segment_ids, name=None): where sum is over j such that `segment_ids[j] == i`. Args: - data (Tensor): A tensor, available data type float32, float64. + data (Tensor): A tensor, available data type float32, float64, int32, int64. segment_ids (Tensor): A 1-D tensor, which have the same size with the first dimension of input data. Available data type is int32, int64. @@ -54,7 +54,8 @@ def segment_sum(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -82,7 +83,7 @@ def segment_mean(data, segment_ids, name=None): of all index 'segment_ids[j] == i'. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -107,7 +108,8 @@ def segment_mean(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -134,7 +136,7 @@ def segment_min(data, segment_ids, name=None): where min is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -159,7 +161,8 @@ def segment_min(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -186,7 +189,7 @@ def segment_max(data, segment_ids, name=None): where max is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -211,7 +214,8 @@ def segment_max(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 5167c18de179dabc4b25bf077d6a81b6ef0b8bf6..6c575b4b997d661d8be79c4e0b457c6a2f34795c 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -59,16 +59,14 @@ class SGD(Optimizer): .. code-block:: python import paddle - import numpy as np - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32') linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) - beta1 = paddle.to_tensor([0.9], dtype="float32") - beta2 = paddle.to_tensor([0.99], dtype="float32") sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) - back = out.backward() + out.backward() sgd.step() sgd.clear_grad() diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 040480c26faa8fca3e9e08cf0b69cc6cdaaeedfc..06c2a82fd696d8f84f6ce1b38fad95834d10d3a0 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -13,9 +13,10 @@ # limitations under the License. import itertools +import numpy as np import re -from .linalg import matmul, transpose +from .linalg import dot, matmul, transpose from .manipulation import squeeze, unsqueeze, reshape from .math import multiply from .math import sum as paddle_sum @@ -111,36 +112,6 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): f"Invalid equation: duplicate output labels are found.") -# ''' -# Tests if the two operands can perform a broadcast operation on the given ranges of dimensions. -# We follow the Numpy broadcasting convention which states that, by lining up the shape arrays -# starting from the right most dimension, all the aligned dimensions either have equal sizes or -# one of them is sized one. -# Parameters -# ---------- -# args: -# *args unpacks into operand one's axes range, shape, operand two's axes range, shape -# f: -# if available, is used as a callback for postprocessing the aligned operand dimensions. -# ''' -# xran, xshape, yran, yshape = args -# -# xran_inv, yran_inv = xran[::-1], yran[::-1] -# -# for xi, yi in zip(xran_inv, yran_inv): -# xs, ys = xshape[xi], yshape[yi] -# cond = xs == ys or xs == 1 or ys == 1 -# if not cond: -# return False -# -# if not f: -# return True -# -# # Apply the callback to each aligned dimension pair -# for xi, yi in zip(xran_inv, yran_inv): -# f(xi, yi) - - def build_view(in_labels, out_labels): ''' Build an inverse map of dimension indices. Three conditions must hold for @@ -291,39 +262,12 @@ def build_global_shape(g_view, g_labels, op_shapes): g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] - g_masks = [[s > 1 for s in view_shape] for view_shape in view_shapes] + g_masks = [[s > 1 or s == -1 for s in view_shape] + for view_shape in view_shapes] return g_shape, g_masks -def dim_strides(shape): - ''' - Returns the dimension strides for a tensor shape - ''' - strides = [] - stride = 1 - for size in shape[::-1]: - strides.append(stride) - stride = stride * size - return strides - - -def create_view(operand, *view_def): - ''' - Create and materialize a view. - - Parameters - ---------- - operand: - the base tensor operand - view_def: - include two lists which define the view's dimension sizes and strides - ''' - assert False, f'Diagonal and trace not implemented yet.' - view_shape, view_strides = view_def - return operand.create_view(view_shape, view_strides) - - def has_duplicated_labels(labels): ''' Returns True if there is any duplicate label. @@ -337,46 +281,17 @@ def diagonalize(labels, operand): Merges dimensions with duplicate labels. For those dimensions with duplicate labels, merge them into one dimension - which represents the diagonal elements. That requires the duplicate labeled - dimensions equal sized. The order of dimensions is kept unchanged up to - the left-most appearance of each label. + which represents the diagonal elements. This requires the dimensions with + duplicate labels are equal sized. Examples -------- 'ijj...i' would be merged into 'ij...' ''' - if not has_duplicated_labels(labels): - return labels, operand - - strides = dim_strides(operand.shape) - shape = operand.shape - new_labels = [] - new_shape = [] - new_strides = [] - - for ax, l in enumerate(labels): - if l == '.' or l not in new_labels: - # not duplicate - new_labels.append(l) - new_strides.append(strides[ax]) - new_shape.append(shape[ax]) - else: - # duplicate label - diag_ax = new_labels.index(l) - new_strides[diag_ax] += strides[ax] + assert not has_duplicated_labels(labels), ( + f'Duplicate labels are not supported.') - # Call framework API to build a new tensor - new_op = create_view(operand, new_shape, new_strides) - return new_labels, new_op - - -def prod(iter, default=1): - if len(iter): - res = 1 - for s in iter: - res *= s - return res - return default + return labels, operand def plan_reduce(plan, op, reduce_dims, keepdim): @@ -408,102 +323,108 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): op1_view, op2_view = [g_view[op] for op in (op1, op2)] - # Note, I may index into -1 - I1_dims = [op1_view[ax] for ax in I if op1_view[ax] >= 0] - I2_dims = [op2_view[ax] for ax in I if op2_view[ax] >= 0] - J1_dims = [op1_view[ax] for ax in J1] - J2_dims = [op2_view[ax] for ax in J2] - K1_dims = [op1_view[ax] for ax in K] - K2_dims = [op2_view[ax] for ax in K] + I1 = [idx for idx in I if op1_view[idx] >= 0] + I2 = [idx for idx in I if op2_view[idx] >= 0] + op1_view = np.array(op1_view) + op1_dims = op1_view[I1 + J1 + K] - op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)] - op1_vshape = [s if m else 1 for s, m in zip(g_shape, op1_mask)] - op2_vshape = [s if m else 1 for s, m in zip(g_shape, op2_mask)] - - I1_shape, J1_shape, K1_shape = [[op1_vshape[ax] for ax in axes] - for axes in (I, J1, K)] - I2_shape, J2_shape, K2_shape = [[op2_vshape[ax] for ax in axes] - for axes in (I, J2, K)] + op2_view = np.array(op2_view) + op2_dims = op2_view[I2 + J2 + K] - K1_size, J1_size, J2_size = prod(K1_shape), prod(J1_shape), prod(J2_shape) + op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)] + op1_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op1_mask)]) + op2_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op2_mask)]) + vshape = np.maximum(op1_vshape, op2_vshape) - perm1 = I1_dims + J1_dims + K1_dims - perm2 = I2_dims + J2_dims + K2_dims + i1, i2, j1, j2, k = map(len, (I1, I2, J1, J2, K)) - if any(i != dim for i, dim in enumerate(perm1)): + if any(op1_dims != np.arange(len(op1_dims))): # print(f'perm1: {perm1}') - step = transpose, [var1], var1, perm1 + step = transpose, [var1], var1, list(op1_dims) plan.add_step(step) - if any(i != dim for i, dim in enumerate(perm2)): + if any(op2_dims != np.arange(len(op2_dims))): # print(f'perm2: {perm2}') - step = transpose, [var2], var2, perm2 + step = transpose, [var2], var2, list(op2_dims) plan.add_step(step) - # In case of no K... dimensions, do a broadcast - if not K: - # unsqueeze operands include J1...J2... dimensions - if J2: - fill_start = len(I2_dims) + len(J1) - fill_end = fill_start + len(J2) - fill = list(range(fill_start, fill_end)) - step = unsqueeze, [var1], var1, fill - plan.add_step(step) - if J1: - fill_start = len(I2_dims) - fill_end = fill_start + len(J1) - fill = list(range(fill_start, fill_end)) - step = unsqueeze, [var2], var2, fill - plan.add_step(step) - # make broadcast - step = multiply, [var1, var2], var2 - plan.add_step(step) - # K... are there, let's reason about I... and J... - # In case I... and J... are empty, do the vector-vector version of matmul - elif not I and not J1 and not J2: - # merge K dimensions - if len(K) > 1: - for var in var1, var2: - step = reshape, [var], var, [K1_size] - plan.add_step(step) - # Build vector-vector matmul - step = matmul, [var1, var2], var2 - plan.add_step(step) - # General case, there are K... and some I... and J..., the actual operation will be - # matrix-vector or matrix-matrix multiplies, depending on the operands' shapes. - else: - # Merge J dims and K dims by reshaping - merged_shape1 = I1_shape + [J1_size] + [K1_size] - merged_shape2 = I2_shape + [J2_size] + [K1_size] + # Check if conditions hold for turnning the operation into a matmul + if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate( + (op1_vshape, op2_vshape)): + op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1]) + ] + [np.prod(op1_vshape[K])] + op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2]) + ] + [np.prod(op2_vshape[K])] - step = reshape, [var1], var1, merged_shape1 + # Merge J dims and K dims by reshaping + step = reshape, [var1], var1, op1_shape plan.add_step(step) - step = reshape, [var2], var2, merged_shape2 + step = reshape, [var2], var2, op2_shape plan.add_step(step) # Matmul step = matmul, [var1, var2], var2, False, True plan.add_step(step) - # The result shape is in I..., J1, J2. Let's reshape back to known dimensions - # Note, this is static deduction, not by reading the tensor shape at runtime - result_shape = [1] * len(I) - for i, ax in enumerate(I): - result_shape[i] = max(op1_vshape[ax], op2_vshape[ax]) - if J1: - result_shape += J1_shape - if J2: - result_shape += J2_shape - - # Need a scalar dimension somehow - if result_shape: - step = reshape, [var2], var2, result_shape + # Reshape back + shape = list(vshape[I + J1 + J2]) + step = reshape, [var2], var2, shape plan.add_step(step) + elif j1 == j2 == k == 1: + # Can still do matmul even unknown shapes are present + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + + # In the rest cases we opt for ops other than matmul + else: + # unsqueeze operands include J1...J2... dimensions + if j2: + fill = list(range(i1 + j1, i1 + j1 + j2)) + step = unsqueeze, [var1], var1, fill + plan.add_step(step) + if j1: + fill = list(range(i2, i2 + j1)) + step = unsqueeze, [var2], var2, fill + plan.add_step(step) + # In case of no dimensions to contract, do an elementwise multiply + if k == 0: + # make broadcast + step = multiply, [var1, var2], var2 + plan.add_step(step) + # Contract and no join, turn into a dot + elif j1 + j2 == 0 and k == 1: + step = unsqueeze, [var1], var1, [-2] + plan.add_step(step) + step = unsqueeze, [var2], var2, [-1] + plan.add_step(step) + step = matmul, [var1, var2], var2 + plan.add_step(step) + step = squeeze, [var2], var2, [-1, -2] + plan.add_step(step) + elif j1 + j2 == 0 and not-1 in np.concatenate( + (op1_vshape[K], op2_vshape[K])): + assert all(op1_vshape[K] == op2_vshape[K]) + step = reshape, [var1], var1, list(op1_vshape[ + I]) + [1] + [np.prod(op1_vshape[K])] + plan.add_step(step) + step = reshape, [var2], var2, list(op2_vshape[ + I]) + [1] + [np.prod(op2_vshape[K])] + plan.add_step(step) + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + step = squeeze, [var2], var2, [-1, -2] + plan.add_step(step) + else: + step = multiply, [var1, var2], var2 + plan.add_step(step) + reduce_dims = list(range(-k, 0)) + plan_reduce(plan, op2, reduce_dims, keepdim=False) + # Wrap up, updating auxiliary data # Updating g_mask for I and J axes - for i, ax in enumerate(I + J1 + J2): - op2_mask[ax] = (result_shape[i] > 1) + for ax in I + J1 + J2: + op2_mask[ax] = vshape[ax] > 1 or vshape[ax] == -1 for ax in K: op2_mask[ax] = False @@ -514,6 +435,8 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): for ax in I + J1 + J2: op2_view[ax], dim = dim, dim + 1 + g_view[op2] = list(op2_view) + def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast): @@ -737,7 +660,6 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): return plan -@dygraph_only def einsum(equation, *operands): r""" einsum(equation, *operands) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index d24b64bf661778618a93b867a296b0b9c23c5fe8..70dea65b7699b413f0dc5fc8d68599229beb3078 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -141,6 +141,14 @@ output : Tensor invoke : full_like(x, 1, dtype, place) +- api : pool2d + args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) + output : Tensor(out) + infer_meta : + func : PoolInferMeta + kernel: + func : pool2d + - api : reshape args : (Tensor x, ScalarArray shape) output : Tensor(out)