diff --git a/cmake/generic.cmake b/cmake/generic.cmake index f7b7c73d4deed1affa69a794ea6b940d12e0a589..9cdc0e127c8d5177c819a5b4a97378e3c6eb77ab 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -195,6 +195,7 @@ function(create_dummy_static_lib TARGET_NAME) # the dummy target would be consisted of limit size libraries set(limit ${merge_LIMIT}) list(LENGTH merge_LIBS libs_len) + message("libs_len ${libs_len}") foreach(lib ${merge_LIBS}) list(APPEND merge_list ${lib}) list(LENGTH merge_list listlen) diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index b54f45363a00daa71b6de6ac87d9657fa0ff2c29..26b6fce08a40c6a0334f11ac9dbb91778c9aabf5 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -739,6 +739,14 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase): self.backward_returns_list, ) = ParseYamlBackward(backward_args_str, backward_returns_str) + # Remove the output which is intermediate + if 'intermediate' in grad_api_contents: + backward_returns_list_new = [] + for return_item in self.backward_returns_list: + if return_item[0] not in grad_api_contents['intermediate']: + backward_returns_list_new.append(return_item) + self.backward_returns_list = backward_returns_list_new + def CollectForwardInfoFromBackwardContents(self): backward_forward_str = self.backward_forward_str @@ -1979,7 +1987,6 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase): fill_zero_str += f"{indent}egr::EagerUtils::FillZeroForEmptyGradInput(&grads[{fwd_position}], input_metas[{fwd_position}]);\n" inplace_grad_input_str = "" - inplaced_tensor_wrapper = False inplace_check_str = "" optional_inplace_var_name = [] # Grad Ins from TensorWrappers diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 
3c2e8bf85a7992a9759f265dccaea4d5056889bc..06323119a7dc64a54682b201ec30d2c0cf03872b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -105,6 +105,7 @@ pass_library(delete_fill_constant_op_pass inference) pass_library(constant_folding_pass inference) pass_library(auto_mixed_precision_pass inference) pass_library(conv2d_fusion_layout_transfer_pass inference) +pass_library(silu_fuse_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) @@ -429,10 +430,6 @@ if(WITH_MKLDNN) test_conv_batch_norm_mkldnn_fuse_pass SRCS mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc DEPS ${TEST_CONV_BN_PASS_DEPS}) - cc_test( - test_scale_matmul_fuse_pass - SRCS mkldnn/scale_matmul_fuse_pass_tester.cc - DEPS scale_matmul_fuse_pass) cc_test( test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc diff --git a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc index efed7dd6e637bc7e9421b3d4afb2090a1c47336c..dd4e0735600bec2a560a00faf338a944bacf702e 100644 --- a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc +++ b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc @@ -143,10 +143,16 @@ void Conv2dFusionLayoutTransferPass::ApplyImpl(ir::Graph *graph) const { static_cast(Get("model_precision")) == phi::DataType::FLOAT16 || Get("enable_gpu_mixed"); - bool cutlass_enable = false; + bool cutlass_enable = Get("use_cutlass"); #ifdef PADDLE_WITH_CUTLASS - cutlass_enable = true; + const auto &prop = platform::GetDeviceProperties(Get("gpu_device_id")); + int sm_version = prop.major * 10 + prop.minor; + // Now we only implement cutlass kernel on SM75. 
+ if (sm_version == 75) { + } else { + cutlass_enable = false; + } #endif if (!(is_fp16_precision && cutlass_enable)) return; @@ -184,10 +190,21 @@ void Conv2dFusionLayoutTransferPass::ApplyImpl(ir::Graph *graph) const { auto filter_names = op_node->Op()->Input("Filter"); auto act_type = op_node->Op()->GetAttrIfExists("activation"); constexpr int CUTLASS_NHWC_ALIGNMENT = 8; - std::unordered_set cutlass_act_set = { + // conv2d_fusion has two forms: conv + bias + act, conv + bias + + // elmentwise_add + act. + std::unordered_set cutlass_cba_act_set = { "relu", "swish", "identity", "leaky_relu"}; - if (!cutlass_act_set.count(act_type)) { - return false; + std::unordered_set cutlass_cbaa_act_set = {"relu"}; + bool is_residual = op_node->Op()->Input("ResidualData").size() >= 1UL; + + if (is_residual) { + if (!cutlass_cbaa_act_set.count(act_type)) { + return false; + } + } else { + if (!cutlass_cba_act_set.count(act_type)) { + return false; + } } // If filter's channel is not multiple of 8, conv2d_fusion not run at nhwc. 
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc index 324f707af1ec5d9e1a9a6fcc2d4704026e84df2c..021d372c2c89aa64863b8731430360e923285720 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc @@ -32,7 +32,11 @@ void AddVarToScope(Scope* param_scope, const DDim& dims) { auto* tensor = param_scope->Var(name)->GetMutable(); tensor->Resize(dims); - tensor->mutable_data(platform::CPUPlace()); + auto* data = tensor->mutable_data(platform::CPUPlace()); + int64_t numel = tensor->numel(); + for (int64_t i = 0; i < numel; ++i) { + data[i] = 0; + } } Scope* CreateParamScope() { diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index 2f527ff1e707bb986aef0da8d721ab8920d6d048..ba18b04d9d04576532a786e940efc02b6d349fd3 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -167,14 +167,19 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { phi::DataType::FLOAT16 || Get("enable_gpu_mixed"); constexpr int CUTLASS_NHWC_ALIGNMENT = 8; - if (is_fp16_precision) { + bool cutlass_enable = Get("use_cutlass"); + if (is_fp16_precision && cutlass_enable) { #ifdef PADDLE_WITH_CUTLASS - // cutlass now support these activations - // cutlass_act_set.insert("swish"); - // cutlass_act_set.insert("relu"); - // cutlass_act_set.insert("identity"); - // cutlass_act_set.insert("leaky_relu"); - + const auto& prop = platform::GetDeviceProperties(Get("gpu_device_id")); + int sm_version = prop.major * 10 + prop.minor; + // Now we only implement cutlass kernel on SM75. + if (sm_version == 75) { + // Cutlass now support these cba activations. 
+ cutlass_act_set.insert("swish"); + cutlass_act_set.insert("relu"); + cutlass_act_set.insert("identity"); + cutlass_act_set.insert("leaky_relu"); + } all_act_set.insert(cutlass_act_set.begin(), cutlass_act_set.end()); #endif } @@ -198,8 +203,8 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { auto* filter_var = scope->FindLocalVar(conv_filter->Name()); auto* filter_tensor = filter_var->GetMutable(); CHECK_EQ(filter_tensor->dims().size() == 4UL, true); - // when this conv2d_fusion problem size is not supported by cutlass and not - // supported by cuDNN, we should not apply this pass + // When this conv2d_fusion problem size is not supported by cutlass and not + // supported by cuDNN, we should not apply this pass. int oc = filter_tensor->dims()[0]; int ic = filter_tensor->dims()[1]; bool cutlass_can_fuse = oc % CUTLASS_NHWC_ALIGNMENT == 0 && diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc deleted file mode 100644 index ed6e63615f7c35f22264aea00ca7dc7d80b97abf..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SetOp(ProgramDesc* prog, - const std::string& type, - const std::vector& inputs, - const std::vector& outputs, - float scale = 1.0f, - float bias = 0.0f) { - auto* op = prog->MutableBlock(0)->AppendOp(); - - op->SetType(type); - if (type == "scale") { - op->SetInput("X", {inputs[0]}); - op->SetAttr("scale", scale); - op->SetAttr("bias", bias); - } else if (type == "matmul") { - op->SetAttr("transpose_X", false); - op->SetAttr("transpose_Y", false); - op->SetInput("X", {inputs[0]}); - if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); - op->SetAttr("alpha", scale); - } else { - FAIL() << "Unexpected operator type."; - } - op->SetOutput("Out", {outputs[0]}); -} - -// a->scale->b -// (b,c)->matmul->d -ProgramDesc BuildProgramDesc(float scale, float bias, float alpha) { - ProgramDesc prog; - - for (auto& v : std::vector({"a", "b", "c", "d"})) { - prog.MutableBlock(0)->Var(v); - } - SetOp(&prog, "scale", {"a"}, {"b"}, scale, bias); - SetOp(&prog, "matmul", {"b", "c"}, {"d"}, alpha); - return prog; -} - -void MainTest(const ProgramDesc& prog, - int removed_nodes_count, - const std::vector scale_in_out, - const std::vector matmul_in_out, - float alpha) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num = graph->Nodes().size(); - auto pass = PassRegistry::Instance().Get("scale_matmul_fuse_pass"); - graph.reset(pass->Apply(graph.release())); - int current_nodes_num = graph->Nodes().size(); - - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "scale") { - EXPECT_EQ(op->Input("X")[0], scale_in_out[0]); - EXPECT_EQ(op->Output("Out")[0], scale_in_out[1]); - } else if (op->Type() == "matmul") { - EXPECT_EQ(op->Input("X")[0], matmul_in_out[0]); - EXPECT_EQ(op->Input("Y")[0], matmul_in_out[1]); - EXPECT_EQ(op->Output("Out")[0], matmul_in_out[2]); - 
EXPECT_EQ(op->GetAttrIfExists("alpha"), alpha); - } - } - } - EXPECT_EQ(original_nodes_num - removed_nodes_count, current_nodes_num); -} - -TEST(ScaleMatmulFusePass, scale_matmul_with_no_bias) { - auto bias = 0.0f; - auto scale = 2.34f; - auto alpha = 3.45f; - int removed_nodes_count = 2; - MainTest(BuildProgramDesc(scale, bias, alpha), - removed_nodes_count, - {}, - {"a", "c", "d"}, - scale * alpha); -} - -TEST(ScaleMatmulFusePass, scale_matmul_with_bias) { - auto bias = 1.0f; - auto scale = 2.34f; - auto alpha = 3.45f; - int removed_nodes_count = 0; - MainTest(BuildProgramDesc(scale, bias, alpha), - removed_nodes_count, - {"a", "b"}, - {"b", "c", "d"}, - alpha); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(scale_matmul_fuse_pass); diff --git a/paddle/fluid/framework/ir/silu_fuse_pass.cc b/paddle/fluid/framework/ir/silu_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..05817968b45c6fccdf0733ee67d7fa881c7ee99c --- /dev/null +++ b/paddle/fluid/framework/ir/silu_fuse_pass.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/silu_fuse_pass.h" +#include +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SiluFusePass::ApplyImpl(ir::Graph* graph) const { + // This pass is used for cutlass, because cutlass can fuse conv + bias + silu + bool cutlass_enable = Get("use_cutlass"); + if (!cutlass_enable) { + return; + } + + const std::string pattern_name = "silu_fuse"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + auto* sigmoid_in = gpd.mutable_pattern()->NewNode("sigmoid_in"); + auto sigmoid_op = + gpd.mutable_pattern()->NewNode("sigmoid_op")->assert_is_op("sigmoid"); + auto sigmoid_out = gpd.mutable_pattern() + ->NewNode("sigmoid_out") + ->assert_is_op_output("sigmoid") + ->AsIntermediate(); + auto elementwise_mul_op = gpd.mutable_pattern() + ->NewNode("elementwise_mul_op") + ->assert_is_op("elementwise_mul"); + + auto elementwise_mul_out = gpd.mutable_pattern() + ->NewNode("elementwise_mul_out") + ->assert_is_op_output("elementwise_mul") + ->AsOutput(); + + sigmoid_op->LinksFrom({sigmoid_in}).LinksTo({sigmoid_out}); + elementwise_mul_op->LinksFrom({sigmoid_in, sigmoid_out}) + .LinksTo({elementwise_mul_out}); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + Node* sigmoid_in_node = subgraph.at(sigmoid_in); + Node* sigmoid_op_node = subgraph.at(sigmoid_op); + Node* elementwise_mul_op_node = subgraph.at(elementwise_mul_op); + Node* elementwise_mul_out_node = subgraph.at(elementwise_mul_out); + + OpDesc new_desc; + new_desc.SetType("swish"); + new_desc.SetAttr("beta", 1.f); + new_desc.SetInput("X", {sigmoid_in_node->Name()}); + new_desc.SetOutput("Out", {elementwise_mul_out_node->Name()}); + new_desc.Flush(); + + std::unordered_set del_node_set; + del_node_set.insert(sigmoid_op_node); + del_node_set.insert(elementwise_mul_op_node); + GraphSafeRemoveNodes(graph, del_node_set); + + auto fused_node = 
graph->CreateOpNode(&new_desc); + IR_NODE_LINK_TO(sigmoid_in_node, fused_node); + IR_NODE_LINK_TO(fused_node, elementwise_mul_out_node); + }; + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(silu_fuse_pass, paddle::framework::ir::SiluFusePass); diff --git a/paddle/fluid/framework/ir/silu_fuse_pass.h b/paddle/fluid/framework/ir/silu_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..6098c6c9b0bcebb49ca92cfdbe3bd62f50653f34 --- /dev/null +++ b/paddle/fluid/framework/ir/silu_fuse_pass.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class SiluFusePass : public FusePassBase { + public: + virtual ~SiluFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 991476dff55b9a4c7e8ff3c1093e18e0b3380fc1..dcb822afb4cadbfeac111e53cbd3cecedf05c77d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1603,11 +1603,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif - auto exe_ctx = ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx); // using cache if (kernel_type_.get()) { dev_ctx = pool.Get(kernel_type_->place_); } + auto exe_ctx = ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx); // TODO(Liu-xiandong): Now we are using too much if-else and hard code in XPU // device, it's ugly, and we will refactor in the future. 
@@ -2716,22 +2716,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( static_cast(-1); proto::VarType::Type data_type = dafault_data_type; - auto in_name_list = ctx.InNameList(); - if (Info().HasOpProtoAndChecker()) { - for (auto& attr : Info().Proto().attrs()) { - auto it = - std::find_if(in_name_list.begin(), - in_name_list.end(), - [&attr](const std::string* name) { - return attr.support_tensor() && *name == attr.name(); - }); - if (it != in_name_list.end()) { - in_name_list.erase(it); - } - } - } - - for (auto* name : in_name_list) { + for (auto* name : ctx.InNameList()) { if (ctx.InputSize(*name) == 1UL) { ParseInputDataType(ctx.InputVar(*name), *name, &data_type); } else { diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 002eb29b776ea083534f4db85c7ad8e2813356cd..f8a4df0617190c539dbf863e92e2bbe331dbdd43 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -202,6 +202,7 @@ struct Argument { // Passed from config. 
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); + DECL_ARGUMENT_FIELD(use_cutlass, UseCutlass, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index c184d94ba7fdf7b9890313a4f8c068c9066483fd..ed82dfbaa04e7d95b3306c9e4fe37ead37bba1f4 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -52,6 +52,7 @@ void IRPassManager::CreatePasses(Argument *argument, for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); pass->Set("use_varseqlen", new bool(argument->tensorrt_use_varseqlen())); + pass->Set("use_cutlass", new bool(argument->use_cutlass())); pass->Set("with_interleaved", new bool(argument->tensorrt_with_interleaved())); pass->Set("tensorrt_transformer_posid", @@ -80,6 +81,10 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("optim_shape_tensor", new std::map>()); + // This gpu_device_id is used by some fp16 precision passes, so move it + // here. 
+ pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + // tuned trt dynamic_shape pass->Set("trt_tuned_dynamic_shape", new bool(argument->tensorrt_tuned_dynamic_shape())); @@ -198,7 +203,6 @@ void IRPassManager::CreatePasses(Argument *argument, "model_opt_cache_dir", new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } - pass->Set("gpu_device_id", new int(argument->gpu_device_id())); pass->Set("use_static_engine", new bool(use_static_engine)); pass->Set("model_from_memory", new bool(argument->model_from_memory())); pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector())); diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc old mode 100644 new mode 100755 index 2ff82986e945caf3ecd0ee91bac02c9a9ad48272..40a8c5ce66a2a5b7c5f54784abdcbdc2c9e3e531 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -222,6 +222,51 @@ void MakeSimpleReusePlan( } } +// Remove the inplace operation from the plan because it does not support memory +// reuse +void DelInplaceOpFromPlan( + Graph* graph, + std::unordered_map* node2cluster, + int sort_kind) { + auto topo_nodes = TopologyVarientSort( + *graph, static_cast(sort_kind)); + for (auto* op_node : topo_nodes) { + if (!op_node->IsOp()) continue; + auto input_tensors = op_node->inputs; + auto output_tensors = op_node->outputs; + + std::unordered_set in_names; + for (const Node* node : input_tensors) { + if (!node->Var()) continue; + if (node->Var()->Persistable()) continue; + std::string var = node->Name(); + in_names.insert(var); + } + + for (const Node* node : output_tensors) { + if (!node->Var()) continue; + if (node->Var()->Persistable()) continue; + std::string var = node->Name(); + if (in_names.find(var) != in_names.end()) { + // delete key + if (node2cluster->count(var)) { + node2cluster->erase(var); + } + // 
delete value + std::string tmp_name = ""; + for (auto it = node2cluster->begin(); it != node2cluster->end(); ++it) { + if (it->second == var) { + if (tmp_name == "") { + tmp_name = it->first; + } + it->second = tmp_name; + } + } + } + } + } +} + // NOTE The optimized opdesc doesn't match ir::Graph. void UpdateOpDescsByReuse( Graph* graph, @@ -324,6 +369,7 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { CollectLifeCycle(graph, &lifecycles, sort_kind); CollectVarMemorySize(graph, &space_table); MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); + DelInplaceOpFromPlan(graph, &node2cluster, sort_kind); auto* pass_res_info = PassResultInfoForRuntime::Instance(); pass_res_info->Set( diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 246cfc44e81dc4b7e2d556cceb7901f07bffb0b6..5d71c7cee1d4356b3475eca7a5187175b3b16165 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -115,6 +115,17 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } +void AnalysisConfig::Exp_EnableUseCutlass() { +#if defined(PADDLE_WITH_CUTLASS) + use_cutlass_ = true; +#else + LOG(ERROR) << "Please compile with cutlass to EnableUseCutlass()"; + use_cutlass_ = false; +#endif + + Update(); +} + void AnalysisConfig::SetExecStream(void *stream) { PADDLE_ENFORCE_NOT_NULL( stream, @@ -389,6 +400,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_fc_padding_); // GPU related. CP_MEMBER(use_gpu_); + CP_MEMBER(use_cutlass_); CP_MEMBER(use_external_stream_); CP_MEMBER(exec_stream_); CP_MEMBER(use_cudnn_); @@ -1249,6 +1261,7 @@ std::string AnalysisConfig::Summary() { // gpu info os.InsertRow({"use_gpu", use_gpu_ ? "true" : "false"}); if (use_gpu_) { + os.InsertRow({"use_cutlass", use_cutlass_ ? 
"true" : "false"}); os.InsertRow({"gpu_device_id", std::to_string(gpu_device_id_)}); os.InsertRow({"enable_gpu_mixed", std::to_string(enable_gpu_mixed_)}); os.InsertRow({"memory_pool_init_size", diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2fe3dbe13e71afa1ca4ead1495490507cc4a16bc..0fb11279ebdf9cb78b316acfcaa2e08d73048b6b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1088,6 +1088,7 @@ void AnalysisPredictor::PrepareArgument() { // Init std::unique_ptr argument_. argument_.reset(new Argument); argument_->SetUseGPU(config_.use_gpu()); + argument_->SetUseCutlass(config_.use_cutlass_); argument_->SetUseFcPadding(config_.use_fc_padding()); argument_->SetGPUDeviceId(config_.gpu_device_id()); argument_->SetEnableIrOptim(config_.enable_ir_optim_); @@ -2396,6 +2397,7 @@ USE_TRT_CONVERTER(cast) USE_TRT_CONVERTER(recover_padding) USE_TRT_CONVERTER(remove_padding) USE_TRT_CONVERTER(equal); +USE_TRT_CONVERTER(not_equal); USE_TRT_CONVERTER(top_k) USE_TRT_CONVERTER(top_k_v2) USE_TRT_CONVERTER(range) diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 41eea1fb98c319b4a70e2a961194df55fee4f35d..0adeaf356de0ac2a131de1e8845a2e6d66a0b44b 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -395,6 +395,12 @@ struct PD_INFER_DECL AnalysisConfig { /// bool use_gpu() const { return use_gpu_; } /// + /// \brief When running the fp16 model on Nvidia GPU, you can also try running + /// your model on cutlass. + /// + void Exp_EnableUseCutlass(); + /// + /// /// \brief A boolean state telling whether the XPU is turned on. /// /// \return bool Whether the XPU is turned on. @@ -1047,6 +1053,7 @@ struct PD_INFER_DECL AnalysisConfig { // GPU related. 
bool use_gpu_{false}; + bool use_cutlass_{false}; int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool enable_gpu_mixed_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 46eca6df552c6fd7705a2c1e8a70d75a28c6d8e7..b4018d883a028d11b116e8d33d9d846eafff807e 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -164,6 +164,7 @@ const std::vector kLiteSubgraphPasses({ const std::vector kGpuLowerPrecisionPasses{ "identity_scale_op_clean_pass", "simplify_with_basic_ops_pass", + "silu_fuse_pass", "delete_quant_dequant_linear_op_pass", "delete_weight_dequant_linear_op_pass", "map_depthwise_conv_to_conv_pass", @@ -172,6 +173,7 @@ const std::vector kGpuLowerPrecisionPasses{ "conv_elementwise_add_act_fuse_pass", "conv_elementwise_add2_act_fuse_pass", "conv_elementwise_add_fuse_pass", + "conv2d_fusion_layout_transfer_pass", "multihead_matmul_fuse_pass_v2", "fused_multi_transformer_encoder_pass", "fused_multi_transformer_decoder_pass", @@ -216,6 +218,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "delete_weight_dequant_linear_op_pass", // "map_depthwise_conv_to_conv_pass", // "constant_folding_pass", // + "silu_fuse_pass", // "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "embedding_eltwise_layernorm_fuse_pass", // @@ -250,7 +253,8 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { #endif // "transpose_flatten_concat_fuse_pass", // "constant_folding_pass", // - "auto_mixed_precision_pass", // + "conv2d_fusion_layout_transfer_pass", // + "auto_mixed_precision_pass" }); use_gpu_ = true; diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index b8f9b22fc7b2b1d184f31c13cf9e752443b0510e..314e5390bde8272c3ba585c06913230baef0a3cc 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -142,7 +142,8 @@ void ConvertConv2d(TensorRTEngine* engine, layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose" " layer failed.")); - layer->setStride(nv_strides); + layer->setStrideNd(nv_strides); + layer->setPrePadding(nv_pre_paddings); if (output_padding.size() > 0) { nv_post_paddings.d[0] -= output_padding[0]; @@ -189,7 +190,7 @@ class Conv2dOpConverter : public OpConverter { TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* { auto* layer = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, + ConvolutionNd, *inputs, n_output, ksize, diff --git a/paddle/fluid/inference/tensorrt/convert/equal_op.cc b/paddle/fluid/inference/tensorrt/convert/equal_op.cc index 3a9627dc99a5c36800d2dea7aff152630b0d8706..d1b4b1c08c81b5100fe74471e61919bf74ab343e 100644 --- a/paddle/fluid/inference/tensorrt/convert/equal_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/equal_op.cc @@ -35,7 +35,6 @@ class EqualOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { -#if IS_TRT_VERSION_GE(8000) framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; @@ -79,11 +78,62 @@ class EqualOpConverter : public OpConverter { layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *X, *Y, nvinfer1::ElementWiseOperation::kEQUAL); RreplenishLayerAndOutput(layer, "equal", {output_name}, test_mode); -#else - PADDLE_THROW( - platform::errors::Fatal("ElementWise Equal Operation is only supported " - "on TRT 8 or higher version.")); -#endif + } +}; + +class NotEqualOpConverter : public OpConverter { + public: + NotEqualOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); 
+ auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + nvinfer1::Dims dims_y = Y->getDimensions(); + + int axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis")); + if (axis < 0) { + axis = std::abs(dims_x.nbDims - dims_y.nbDims); + } + auto output_name = op_desc.Output("Out")[0]; + nvinfer1::IShuffleLayer* expand_layer = nullptr; + if (dims_x.nbDims > dims_y.nbDims) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = dims_x.nbDims; + for (int i = 0; i < expand_shape.nbDims; i++) { + expand_shape.d[i] = 1; + } + for (int i = 0; i < dims_y.nbDims; i++) { + expand_shape.d[i + axis] = dims_y.d[i]; + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *Y); + expand_layer->setReshapeDimensions(expand_shape); + Y = expand_layer->getOutput(0); + } else if (dims_x.nbDims < dims_y.nbDims) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = dims_y.nbDims; + for (int i = 0; i < expand_shape.nbDims; i++) { + expand_shape.d[i] = 1; + } + for (int i = 0; i < dims_x.nbDims; i++) { + expand_shape.d[i + axis] = dims_x.d[i]; + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } + + layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *X, *Y, nvinfer1::ElementWiseOperation::kEQUAL); + + layer = TRT_ENGINE_ADD_LAYER( + engine_, Unary, *layer->getOutput(0), nvinfer1::UnaryOperation::kNOT); + + RreplenishLayerAndOutput(layer, "not_equal", {output_name}, test_mode); } }; @@ -92,3 +142,4 @@ class EqualOpConverter : public OpConverter { } // namespace paddle REGISTER_TRT_OP_CONVERTER(equal, EqualOpConverter); +REGISTER_TRT_OP_CONVERTER(not_equal, NotEqualOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index e5e344e16cbb34379945d3e45fff64deda3800b8..66bfe56f355d9026bf6f648a376da06f147e6a45 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ 
b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -119,24 +119,21 @@ struct SimpleOpTypeSetTeller : public Teller { #endif } - // In static shape mode in TRT, we can't allow that op's input is a - // 1D-tensor So we filter it here. Some op like elementwise having "Y" too, - // but that is dealt with in the specified op, here just the common case + // In static shape in Paddle-TRT, we can't allow that one op has a + // 1D intermediate tensor as input. if (!with_dynamic_shape) { - std::string X_name; auto inputs = desc.Inputs(); - if (inputs.count("X") && !desc.Input("X").empty()) { - X_name = desc.Input("X")[0]; - } else if (inputs.count("Input") && !desc.Input("Input").empty()) { - X_name = desc.Input("Input")[0]; - } - auto* block = desc.Block(); - if (block) { - auto* x_var_desc = block->FindVar(X_name); - // Can't get feed op's TensorDesc - if (op_type != "feed" && x_var_desc && !x_var_desc->Persistable()) { - const auto x_shape = x_var_desc->GetShape(); - if (x_shape.size() == 1) return false; + for (auto iter : inputs) { + for (auto var_name : iter.second) { + auto* block = desc.Block(); + if (block) { + auto* var_desc = block->FindVar(var_name); + // Can't get feed op's TensorDesc + if (op_type != "feed" && var_desc && !var_desc->Persistable()) { + const auto shape = var_desc->GetShape(); + if (shape.size() == 1) return false; + } + } } } } @@ -2341,7 +2338,7 @@ struct SimpleOpTypeSetTeller : public Teller { } #endif - if (op_type == "equal") { + if (op_type == "equal" || op_type == "not_equal") { #if !IS_TRT_VERSION_GE(8000) VLOG(3) << "compare is not supported when TensorRT < 8.0"; return false; @@ -2493,6 +2490,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_max", "elementwise_floordiv", "equal", + "not_equal", "less_than", "greater_than", "logical_or", @@ -2639,6 +2637,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_max", "elementwise_floordiv", "equal", + "not_equal", "less_than", "greater_than", "logical_or", diff --git 
a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index 27440c9408baac4a2d999cf8785a395585d16047..022c21a205dd4ace957c330afbb17fc6378d278f 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -330,3 +330,13 @@ REGISTER_OPERATOR( ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); + +// This op is used by cutlass, conv2d_fusion_cutlass is a intermediate op +// produced by conv2d_fusion_layout_transfer_pass. +REGISTER_OPERATOR( + conv2d_fusion_cutlass, + ops::Conv2DFusionOp, + ops::Conv2DFusionOpMaker, + ops::ConvOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/generator/generate_op.py b/paddle/fluid/operators/generator/generate_op.py index d7121a2aeb567ab31e3ba17e21460f8be0af1f48..a0f4cfacfde38f25b5aa30f4482206095c8e236e 100644 --- a/paddle/fluid/operators/generator/generate_op.py +++ b/paddle/fluid/operators/generator/generate_op.py @@ -131,9 +131,10 @@ def process_int_array(op_item, int_array_configs): ) if attr_item['is_support_tensor']: attr_item['typename'] = ( - data_type_map[int_array_config['data_type']] + 'int[]' if 'data_type' in int_array_config - else 'std::vector' + and int_array_config['data_type'] == 'int' + else 'int64_t[]' ) else: attr_item['data_type'] = ( @@ -153,21 +154,95 @@ def process_int_array(op_item, int_array_configs): # replace name of op and params for OpMaker -def replace_compat_name(op_op_map, forward_op_dict, backward_op_dict): - def get_op_and_op_name(op_item): +def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict): + def get_phi_and_fluid_op_name(op_item): names = op_item.split('(') if len(names) == 1: return names[0].strip(), names[0].strip() else: return names[0].strip(), names[1].split(')')[0].strip() - def update_op_attr_name(attrs, attrs_alias_map): - for attr_item in 
attrs: - if attr_item['name'] in attrs_alias_map: - attr_item['name'] = attrs_alias_map[attr_item['name']] + def update_op_param_name(op_args, args_alias_map): + for item in op_args: + if item['name'] in args_alias_map: + item['name'] = args_alias_map[item['name']] + + def update_grad_args_name(op_args, args_alias_map): + for item in op_args: + if ( + item['name'].endswith('_grad') + and item['name'][:-5] in args_alias_map + ): + args_alias_map[item['name']] = ( + args_alias_map[item['name'][:-5]] + '_grad' + ) + item['name'] = args_alias_map[item['name'][:-5]] + '_grad' + + def get_param_list_alias(param_list, args_map): + return [ + args_map[param] if param in args_map else param + for param in param_list + ] - for op_args in op_op_map: - new_op_name, op_name = get_op_and_op_name(op_args['op']) + def update_common_params_name( + op_item, args_name_map, scalar_configs, int_array_configs + ): + if 'inplace' in op_item and op_item['inplace']: + inplace_map = {} + for key, val in op_item['inplace'].items(): + if key in args_map: + key = args_map[key] + if val in args_map: + val = args_map[val] + inplace_map[key] = val + op_item['inplace'] = inplace_map + if 'no_need_buffer' in op_item and op_item['no_need_buffer']: + op_item['no_need_buffer'] = get_param_list_alias( + op_item['no_need_buffer'], args_map + ) + + process_scalar(op_item, scalar_configs) + process_int_array(op_item, int_array_configs) + + if 'invoke' in op_item: + op_item['invoke']['args'] = [ + args_map[param.strip()] + if param.strip() in args_map + else param.strip() + for param in op_item['invoke']['args'].split(',') + ] + return + op_item['infer_meta']['param'] = get_param_list_alias( + op_item['infer_meta']['param'], args_name_map + ) + op_item['kernel']['param'] = get_param_list_alias( + op_item['kernel']['param'], args_name_map + ) + if op_item['kernel']['data_type']: + op_item['kernel']['data_type']['candidates'] = get_param_list_alias( + op_item['kernel']['data_type']['candidates'], 
args_name_map + ) + if op_item['kernel']['backend']: + op_item['kernel']['backend']['candidates'] = get_param_list_alias( + op_item['kernel']['backend']['candidates'], args_name_map + ) + if op_item['kernel']['layout']: + op_item['kernel']['layout']['candidates'] = get_param_list_alias( + op_item['kernel']['layout']['candidates'], args_name_map + ) + + def update_grad_op_compat_name(grad_op_item, args_name_map): + update_op_param_name(grad_op_item['inputs'], args_name_map) + update_op_param_name(grad_op_item['outputs'], args_name_map) + update_op_param_name(grad_op_item['attrs'], args_name_map) + update_op_param_name(grad_op_item['forward']['inputs'], args_name_map) + update_op_param_name(grad_op_item['forward']['outputs'], args_name_map) + update_op_param_name(grad_op_item['forward']['attrs'], args_name_map) + update_grad_args_name(grad_op_item['inputs'], args_map) + update_grad_args_name(grad_op_item['outputs'], args_map) + + for op_args in op_fluid_map_list: + new_op_name, op_name = get_phi_and_fluid_op_name(op_args['op']) if new_op_name not in forward_op_dict: continue forward_op_item = forward_op_dict[new_op_name] @@ -179,189 +254,102 @@ def replace_compat_name(op_op_map, forward_op_dict, backward_op_dict): scalar_configs = None int_array_configs = None - if 'scalar' in op_args: scalar_configs = op_args['scalar'] if 'int_array' in op_args: int_array_configs = op_args['int_array'] + if 'extra' in op_args and 'outputs' in op_args['extra']: + for out_item in forward_op_item['outputs']: + if out_item['name'] in op_args['extra']['outputs']: + out_item['is_extra'] = True - process_scalar(forward_op_item, scalar_configs) - process_int_array(forward_op_item, int_array_configs) + key_set = ['inputs', 'attrs', 'outputs'] + args_map = {} + for key in key_set: + if key in op_args: + args_map.update(op_args[key]) + for args_item in forward_op_item[key]: + if args_item['name'] in op_args[key]: + if ( + scalar_configs + and args_item['name'] in scalar_configs + ): + 
scalar_configs[ + op_args[key][args_item['name']] + ] = scalar_configs[args_item['name']] + if ( + int_array_configs + and args_item['name'] in int_array_configs + ): + int_array_configs[ + op_args[key][args_item['name']] + ] = int_array_configs[args_item['name']] + args_item['name'] = op_args[key][args_item['name']] + if has_backward: + for args_item in backward_op_item['forward'][key]: + if args_item['name'] in op_args[key]: + args_item['name'] = op_args[key][args_item['name']] + forward_op_item["attr_dict"] = to_named_dict(forward_op_item["attrs"]) + update_common_params_name( + forward_op_item, args_map, scalar_configs, int_array_configs + ) + + if has_backward: + update_grad_op_compat_name(backward_op_item, args_map) + update_common_params_name( + backward_op_item, args_map, scalar_configs, int_array_configs + ) + backward_op_item["attr_dict"] = to_named_dict( + backward_op_item["attrs"] + ) + + if 'backward' not in op_args: + continue - if 'backward' in op_args and has_backward: backward_op_list = op_args['backward'].split(',') - _, bw_op_name = get_op_and_op_name(backward_op_list[0]) + _, bw_op_name = get_phi_and_fluid_op_name(backward_op_list[0]) forward_op_item['backward'] = bw_op_name backward_op_item['op_name'] = bw_op_name - process_scalar(backward_op_item, scalar_configs) - process_int_array(backward_op_item, int_array_configs) - # for double grad if len(backward_op_list) > 1: ( - new_double_grad_op_name, + phi_double_grad_op_name, double_grad_op_name, - ) = get_op_and_op_name(backward_op_list[1]) - double_grad_item = backward_op_dict[new_double_grad_op_name] + ) = get_phi_and_fluid_op_name(backward_op_list[1]) + double_grad_item = backward_op_dict[phi_double_grad_op_name] backward_op_item['backward'] = double_grad_op_name double_grad_item['op_name'] = double_grad_op_name - if 'attrs' in op_args: - update_op_attr_name( - double_grad_item['attrs'], op_args['attrs'] - ) - update_op_attr_name( - double_grad_item['forward']['attrs'], op_args['attrs'] - ) - 
- process_scalar(double_grad_item, scalar_configs) - process_int_array(double_grad_item, int_array_configs) + update_grad_op_compat_name(double_grad_item, args_map) + update_common_params_name( + double_grad_item, + args_map, + scalar_configs, + int_array_configs, + ) + double_grad_item["attr_dict"] = to_named_dict( + double_grad_item["attrs"] + ) # for triple grad if len(backward_op_list) > 2: ( - new_triple_grad_op_name, + phi_triple_grad_op_name, triple_grad_op_name, - ) = get_op_and_op_name(backward_op_list[2]) - triple_grad_item = backward_op_dict[new_triple_grad_op_name] + ) = get_phi_and_fluid_op_name(backward_op_list[2]) + triple_grad_item = backward_op_dict[phi_triple_grad_op_name] double_grad_item['backward'] = triple_grad_op_name triple_grad_item['op_name'] = triple_grad_op_name - if 'attrs' in op_args: - update_op_attr_name( - triple_grad_item['attrs'], op_args['attrs'] - ) - update_op_attr_name( - triple_grad_item['forward']['attrs'], - op_args['attrs'], - ) - - process_scalar(triple_grad_item, scalar_configs) - process_int_array(triple_grad_item, int_array_configs) - - key_set = ['inputs', 'attrs', 'outputs'] - args_map = {} - for key in key_set: - if key in op_args: - args_map.update(op_args[key]) - for args_item in forward_op_item[key]: - if args_item['name'] in op_args[key]: - args_item['name'] = op_args[key][args_item['name']] - if has_backward: - for args_item in backward_op_item['forward'][key]: - if args_item['name'] in op_args[key]: - args_item['name'] = op_args[key][args_item['name']] - forward_op_item['infer_meta']['param'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['infer_meta']['param'] - ] - forward_op_item['kernel']['param'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['kernel']['param'] - ] - if forward_op_item['kernel']['data_type']: - forward_op_item['kernel']['data_type']['candidates'] = [ - args_map[param] if param in args_map else param - for param in 
forward_op_item['kernel']['data_type'][ - 'candidates' - ] - ] - if forward_op_item['kernel']['backend']: - forward_op_item['kernel']['backend']['candidates'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['kernel']['backend']['candidates'] - ] - if forward_op_item['kernel']['layout']: - forward_op_item['kernel']['layout']['candidates'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['kernel']['layout']['candidates'] - ] - if forward_op_item['inplace']: - inplace_map = {} - for key, val in forward_op_item['inplace'].items(): - if key in args_map: - key = args_map[key] - if val in args_map: - val = args_map[val] - inplace_map[key] = val - forward_op_item['inplace'] = inplace_map - - if has_backward: - for args_item in backward_op_item['inputs']: - if args_item['name'] in args_map: - args_item['name'] = args_map[args_item['name']] - elif ( - args_item['name'].endswith('_grad') - and args_item['name'][:-5] in args_map - ): - args_map[args_item['name']] = ( - args_map[args_item['name'][:-5]] + '_grad' + update_grad_op_compat_name(triple_grad_item, args_map) + update_common_params_name( + triple_grad_item, + args_map, + scalar_configs, + int_array_configs, ) - args_item['name'] = args_map[args_item['name']] - for args_item in backward_op_item['attrs']: - if args_item['name'] in args_map: - args_item['name'] = args_map[args_item['name']] - for args_item in backward_op_item['outputs']: - if ( - args_item['name'].endswith('_grad') - and args_item['name'][:-5] in args_map - ): - args_map[args_item['name']] = ( - args_map[args_item['name'][:-5]] + '_grad' + triple_grad_item["attr_dict"] = to_named_dict( + triple_grad_item["attrs"] ) - args_item['name'] = args_map[args_item['name']] - - if 'invoke' in backward_op_item: - backward_op_item['invoke']['args'] = [ - args_map[param.strip()] - if param.strip() in args_map - else param.strip() - for param in backward_op_item['invoke']['args'].split(',') - ] - 
continue - - backward_op_item['infer_meta']['param'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['infer_meta']['param'] - ] - backward_op_item['kernel']['param'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['kernel']['param'] - ] - if backward_op_item['kernel']['data_type']: - backward_op_item['kernel']['data_type']['candidates'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['kernel']['data_type'][ - 'candidates' - ] - ] - if backward_op_item['kernel']['backend']: - backward_op_item['kernel']['backend']['candidates'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['kernel']['backend'][ - 'candidates' - ] - ] - if backward_op_item['kernel']['layout']: - backward_op_item['kernel']['layout']['candidates'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['kernel']['layout'][ - 'candidates' - ] - ] - if backward_op_item['no_need_buffer']: - backward_op_item['no_need_buffer'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['no_need_buffer'] - ] - if backward_op_item['inplace']: - inplace_map = {} - for key, val in backward_op_item['inplace'].items(): - if key in args_map: - key = args_map[key] - if val in args_map: - val = args_map[val] - inplace_map[key] = val - backward_op_item['inplace'] = inplace_map def process_invoke_op(forward_op_dict, backward_op_dict): @@ -372,6 +360,7 @@ def process_invoke_op(forward_op_dict, backward_op_dict): args_index = 0 if invoke_op in forward_op_dict: reuse_op = forward_op_dict[invoke_op] + bw_op['invoke']['func'] = reuse_op['op_name'] bw_op['invoke']['inputs'] = [] bw_op['invoke']['attrs'] = [] bw_op['invoke']['outputs'] = [] @@ -430,14 +419,14 @@ def main( forward_op_dict[op_version['op']]['version'] = op_version['version'] with open(op_compat_yaml_path, "rt") as f: - op_op_map = yaml.safe_load(f) + 
op_fluid_map_list = yaml.safe_load(f) for op in ops: op['op_name'] = op['name'] for bw_op in backward_ops: bw_op['op_name'] = bw_op['name'] - replace_compat_name(op_op_map, forward_op_dict, backward_op_dict) + replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict) # prepare for invoke case process_invoke_op(forward_op_dict, backward_op_dict) diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 index 0b49721afcc9e0f7e15b2d77dba8f527c59a86ee..b28c8bdc1a297891945626b5f4d009f7d41e77cc 100644 --- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 +++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 @@ -54,6 +54,10 @@ AddOutput({{name | to_opmaker_name}}, "({{typename}}), output {{i}} of {{op_name .AsIntermediate() {%- endif %} + {%- if "is_extra" in output and output["is_extra"] %} + + .AsExtra() + {%- endif %} {%- endmacro %} {# add attribute, and process default value if needed #} @@ -115,7 +119,7 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum paddle::small_vector attrs; {% for attr in op["attrs"]%} {% filter indent(2)%} - {{get_an_attr(attr)}} + {{get_an_attr(attr, kernel_args)}} {% endfilter %} {% endfor %} {{get_output_list(op["outputs"], kernel_args)}}; @@ -170,7 +174,7 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum paddle::small_vector attrs; {% for attr in op["attrs"]%} {% filter indent(2)%} - {{get_an_attr(attr)}} + {{get_an_attr(attr, kernel_args)}} {% endfilter %} {% endfor %} {{get_output_list(op["outputs"], kernel_args)}}; @@ -209,8 +213,9 @@ paddle::small_vector inputs { } {%- endmacro %} -{% macro get_an_attr(attr) %}{# inline #} +{% macro get_an_attr(attr, kernel_args) %}{# inline #} {% set typename = attr["typename"] %} +{%- if attr["name"] in kernel_args %} {% set name = attr["name"] %} {% if typename is scalar %}{# scalar correspond to a 
dispensable input and an attr in opmaker #} attrs.emplace_back(ctx.HasInput("{{attr | to_scalar_tensor_name}}") ? "{{attr | to_scalar_tensor_name}}" : "{{name}}"); @@ -236,6 +241,7 @@ attrs.emplace_back( {%- else %} attrs.emplace_back("{{name}}"); {%- endif %} +{%- endif %} {%- endmacro %} {% macro get_output_list(outputs, kernel_args) %}{# inline #} @@ -502,10 +508,9 @@ OutputGrad({{name_in_forward_orig | to_opmaker_name}}) {% set name_in_forward = name[:-5] %} {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} InputGrad({{name_in_forward_orig | to_opmaker_name}}) - {%- elif (name | to_input_name) in input_names %} - {% set name_in_forward = name | to_input_name %} - {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} -InputGrad({{name | to_input_name | to_opmaker_name}}) + {%- elif (name) in input_names %} + {% set name_in_forward_orig = input_orig_names[input_names.index(name)]%} +Input({{name | to_opmaker_name}}) {%- endif %} {%- endmacro %} diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 4e6a10a912a8888337b44363ad166bb1dcc043bc..2951091508dd6d82af7ea0ccf5817e088d1f42c9 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -30,6 +30,13 @@ class PadOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } }; class PadOpMaker : public framework::OpProtoAndCheckerMaker { @@ -98,6 +105,14 @@ class PadOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, dout_dims); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& 
ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } }; template diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 6e8b962488a56800eb0ad985acd0113fe9fd2422..e980aa66e7ca33467cfe216fbf04e3b5649d9c15 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -114,11 +114,6 @@ class ReshapeOp : public framework::OperatorWithKernel { return; } - PADDLE_ENFORCE_EQ(!shape.empty(), - true, - platform::errors::InvalidArgument( - "The parameter 'shape' in ReshapeOp must be set. " - "But received 'shape' is empty.")); auto x_dims = ctx->GetInputDim("X"); auto out_dims = ValidateShape(shape, x_dims); ctx->SetOutputDim("Out", out_dims); diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index bfadb45631210987410fc4b106859f4b146eabf3..7b023bcdf662cccfdfe89f9a2074c6a04bbfad33 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -195,17 +195,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class Squeeze2Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } -}; - template class SqueezeGradOpMaker : public framework::SingleGradOpMaker { public: @@ -220,32 +209,6 @@ class SqueezeGradOpMaker : public framework::SingleGradOpMaker { } }; -class Squeeze2GradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK( - 
context->HasInput("XShape"), "Input", "XShape", "Squeeze2Grad"); - OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "Squeeze2Grad"); - auto xshape_dims = context->GetInputDim("XShape"); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - context->SetOutputDim(framework::GradVarName("X"), x_dims); - context->ShareLoD("XShape", framework::GradVarName("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } -}; - template class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { public: @@ -259,82 +222,6 @@ class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { } }; -// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze, -// the XShape is used to carry the shape and lod of X which will be used in -// squeeze_grad, in this way, the framework can reuse the memory of X -// immediately the squeeze2_op is finished. -// Considering compatibility issues, we could not fix squeeze2_op -class Squeeze2OpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor). The input tensor of squeeze operator."); - AddOutput("Out", "(Tensor). The output tensor of squeeze operator."); - AddOutput("XShape", - "XShape is just used to store the shape and lod of X, which will " - "be used in SqueezeGradOp.") - .AsIntermediate() - .AsExtra(); - AddAttr>("axes", - "(std::vector). List of integers," - " indicating the dimensions to squeeze.") - .SetDefault({}) - .SupportTensor(); - AddComment(R"DOC( - Squeeze2 Operator. - - Remove single-dimensional entries from the shape of a tensor. - Takes a parameter axes with a list of axes to squeeze. 
- If axes is not provided, all the single dimensions will be removed from the shape. - If an axis is selected with shape entry not equal to one, an error is raised. - - Examples: - Case 1: - Given - X.shape = (1, 3, 1, 5) - and - axes = [0] - we get: - Out.shape = (3, 1, 5) - - Case 2: - Given - X.shape = (1, 3, 1, 5) - and - axes = [] - we get: - Out.shape = (3, 5) - )DOC"); - } -}; - -template -class Squeeze2GradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("squeeze2_grad"); - grad_op->SetInput("XShape", this->Output("XShape")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -template -class Squeeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("squeeze2"); - grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); - grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); - grad_op->SetOutput("XShape", this->Input("XShape")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer, {framework::GradVarName("Out"), @@ -345,10 +232,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X"); namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(squeeze2, - SqueezeInferShapeFunctor, - PD_INFER_META(phi::SqueezeWithXShapeInferMeta)); - REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, @@ -360,19 +243,6 @@ REGISTER_OPERATOR(squeeze_grad, ops::SqueezeDoubleGradOpMaker, ops::SqueezeGradNoNeedBufferVarsInferer); -REGISTER_OPERATOR(squeeze2, - 
ops::Squeeze2Op, - ops::Squeeze2OpMaker, - ops::Squeeze2GradOpMaker, - ops::Squeeze2GradOpMaker, - ops::SqueezeInplaceInferer, - SqueezeInferShapeFunctor); -REGISTER_OPERATOR(squeeze2_grad, - ops::Squeeze2GradOp, - ops::Squeeze2DoubleGradOpMaker, - ops::Squeeze2DoubleGradOpMaker, - ops::SqueezeGradInplaceInferer); - REGISTER_OP_CPU_KERNEL( squeeze, ops::SqueezeKernel, diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 8f28e0b606b0351d62c3a1429c3af8c988366a90..d092c03a56398488e44ab6bce5162b66568e607f 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -260,83 +260,6 @@ class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { } }; -// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on -// unsqueeze, the XShape is used to carry the shape and lod of X which -// will be used in unsqueeze_grad, in this way, the framework can reuse -// the memory of X immediately the unsqueeze2_op is finished. 
-// Considering compatibility issues, we could not fix unsqueeze2_op -class Unsqueeze2Op : public UnsqueezeOp { - public: - using UnsqueezeOp::UnsqueezeOp; -}; - -class Unsqueeze2OpMaker : public UnsqueezeOpMaker { - public: - void Make() override { - UnsqueezeOpMaker::Make(); - AddOutput("XShape", - "XShape is just used to store the shape and lod of X, which will " - "be used in UnsqueezeGradOp.") - .AsIntermediate() - .AsExtra(); - } -}; - -template -class Unsqueeze2GradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("unsqueeze2_grad"); - grad_op->SetInput("XShape", this->Output("XShape")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class Unsqueeze2GradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE_EQ( - context->HasInput("XShape"), - true, - platform::errors::InvalidArgument("Input(XShape) shouldn't be null.")); - PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) shouldn't be null.")); - auto xshape_dims = context->GetInputDim("XShape"); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - context->SetOutputDim(framework::GradVarName("X"), x_dims); - context->ShareLoD("XShape", framework::GradVarName("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); - } -}; - -template -class Unsqueeze2DoubleGradOpMaker : 
public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("unsqueeze2"); - grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); - grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); - grad_op->SetOutput("XShape", this->Input("XShape")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer, {framework::GradVarName("Out"), @@ -345,10 +268,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X"); } // namespace operators } // namespace paddle -DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2, - Unsqueeze2InferShapeFunctor, - PD_INFER_META(phi::UnsqueezeWithXShapeInferMeta)); - namespace ops = paddle::operators; REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, @@ -362,20 +281,6 @@ REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeDoubleGradOpMaker, ops::UnsqueezeGradOpNoNeedBufferVarInferer); -REGISTER_OPERATOR(unsqueeze2, - ops::Unsqueeze2Op, - ops::Unsqueeze2OpMaker, - ops::Unsqueeze2GradOpMaker, - ops::Unsqueeze2GradOpMaker, - Unsqueeze2InferShapeFunctor, - ops::UnsqueezeInplaceInferer); - -REGISTER_OPERATOR(unsqueeze2_grad, - ops::Unsqueeze2GradOp, - ops::Unsqueeze2DoubleGradOpMaker, - ops::Unsqueeze2DoubleGradOpMaker, - ops::UnsqueezeGradInplaceInferer); - REGISTER_OP_CPU_KERNEL( unsqueeze, ops::UnsqueezeKernel, diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 9a791e4f2e36243931216b409e03d83de8e26865..d314a9a7835190643b165ae287a52531d87b4b9d 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -646,6 +646,7 @@ void BindAnalysisConfig(py::module *m) { py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) + 
.def("exp_enable_use_cutlass", &AnalysisConfig::Exp_EnableUseCutlass) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) .def("set_exec_stream", [](AnalysisConfig &self, phi::CUDAStream &stream) { diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index aec5c7632a8663f94830897a0bc6e0b7876ffa07..7be9e8fb187378e2e9028f5585e8c19cdf8792e8 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -44,11 +44,7 @@ set(PHI_DEPS get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) -if(APPLE AND WITH_ARM) - cc_library(phi DEPS ${PHI_DEPS}) -else() - create_dummy_static_lib(phi LIBS ${PHI_DEPS} LIMIT 100) -endif() +cc_library(phi DEPS ${PHI_DEPS}) set(phi_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h index 7233744c65c3fd482810608cb04b6be5092e7f7b..17c0dd3f8732dde96d371d99bc8798692146a3f3 100644 --- a/paddle/phi/api/ext/tensor_compat.h +++ b/paddle/phi/api/ext/tensor_compat.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ // Note(chenweihang): In order to be compatible with the original custom // operator Tensor interface, only available to external users, the file -// cannot be includeed in paddle +// cannot be included in paddle namespace paddle { using Tensor = experimental::Tensor; diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index dead42d03f7bcfdfc3fc6b0deabcf3ec8e4bb54d..8f107f02dafafad4c00ffcfcb5b51d3d551c5213 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1186,6 +1186,26 @@ backward : square_double_grad inplace : (out_grad -> x_grad) +- backward_op : squeeze_double_grad + forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray axis) + output : Tensor(grad_out_grad), Tensor(xshape) + invoke: squeeze(grad_x_grad, axis) + intermediate : xshape + +- backward_op : squeeze_grad + forward : squeeze(Tensor x, IntArray axis) -> Tensor(out), Tensor(xshape) + args : (Tensor xshape, Tensor out_grad, IntArray axis) + output : Tensor(x_grad) + infer_meta : + func : KernelWithXShapeInferMeta + param: [xshape] + kernel : + func : squeeze_grad + data_type : out_grad + inplace : (out_grad -> x_grad) + backward: squeeze_double_grad + - backward_op : svd_grad forward : svd (Tensor x, bool full_matrices = false) -> Tensor(u), Tensor(s), Tensor(vh) args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full_matrices) @@ -1321,6 +1341,27 @@ data_type : out_grad no_need_buffer : x +- backward_op : unsqueeze_double_grad + forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray axes) + output : Tensor(grad_out_grad), Tensor(xshape) + invoke : unsqueeze(grad_x_grad, axes) + intermediate : xshape + +- backward_op : unsqueeze_grad + forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) + args : (Tensor xshape, 
Tensor out_grad, IntArray axes) + output : Tensor(x_grad) + infer_meta : + func : KernelWithXShapeInferMeta + param: [xshape] + kernel : + func : unsqueeze_grad + param : [xshape, out_grad] + data_type : out_grad + inplace : (out_grad -> x_grad) + backward : unsqueeze_double_grad + - backward_op : unstack_grad forward : unstack (Tensor x, int axis=0, int num=0) -> Tensor[](out) args : (Tensor[] out_grad, int axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 8d7af90a90a59a1b2e45d3ab235aa0c0c45a1c72..acc7b670ba524945fd32ff6d81ab351d18a3e268 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -1363,24 +1363,6 @@ kernel : func : squared_l2_norm_grad -- backward_op : squeeze_double_grad - forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x) - args : (Tensor grad_x_grad, IntArray axis) - output : Tensor(grad_out_grad) - invoke: squeeze(grad_x_grad, axis) - -- backward_op : squeeze_grad - forward : squeeze(Tensor x, IntArray axis) -> Tensor(out), Tensor(xshape) - args : (Tensor xshape, Tensor out_grad, IntArray axis) - output : Tensor(x_grad) - infer_meta : - func : KernelWithXShapeInferMeta - param: [xshape] - kernel : - func : squeeze_grad - inplace : (out_grad -> x_grad) - backward: squeeze_double_grad - - backward_op : stack_grad forward : stack (Tensor[] x, int axis) -> Tensor(out) args : (Tensor[] x, Tensor out_grad, int axis) @@ -1574,25 +1556,6 @@ func : uniform_inplace_grad inplace : (out_grad -> x_grad) -- backward_op : unsqueeze_double_grad - forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) - args : (Tensor grad_x_grad, IntArray axes) - output : Tensor(grad_out_grad) - invoke : unsqueeze(grad_x_grad, axes) - -- backward_op : unsqueeze_grad - forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) - args : (Tensor xshape, Tensor out_grad, IntArray axes) - 
output : Tensor(x_grad) - infer_meta : - func : KernelWithXShapeInferMeta - param: [xshape] - kernel : - func : unsqueeze_grad - param: [xshape, out_grad] - inplace : (out_grad -> x_grad) - backward : unsqueeze_double_grad - - backward_op : warpctc_grad forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad) args : (Tensor logits, Tensor logits_length, Tensor warpctcgrad, Tensor loss_grad, int blank, bool norm_by_times) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index b93ca2944ab85f686a2fa6c83c5ba0455baeba92..6dfff5d510d65e5353aa7e79e29a159f0602dde6 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1777,18 +1777,6 @@ func : squared_l2_norm backward : squared_l2_norm_grad -- op : squeeze - args : (Tensor x, IntArray axis) - output : Tensor(out), Tensor(xshape) - infer_meta : - func : SqueezeWithXShapeInferMeta - kernel : - func : squeeze_with_xshape - inplace : (x -> out) - view: (x -> out) - intermediate : xshape - backward : squeeze_grad - - op : stack args : (Tensor[] x, int axis) output : Tensor @@ -2022,18 +2010,6 @@ data_type: x backward: unpool3d_grad -- op : unsqueeze - args : (Tensor x, IntArray axis) - output : Tensor(out), Tensor(xshape) - infer_meta : - func : UnsqueezeWithXShapeInferMeta - kernel : - func : unsqueeze_with_xshape - inplace : (x -> out) - view: (x -> out) - intermediate : xshape - backward : unsqueeze_grad - - op : update_loss_scaling_ args : (Tensor[] x, Tensor found_infinite, Tensor prev_loss_scaling, Tensor in_good_steps, Tensor in_bad_steps, int incr_every_n_steps, int decr_every_n_nan_or_inf, float incr_ratio, float decr_ratio, Scalar stop_update) output : Tensor[](out){x.size()}, Tensor(loss_scaling), Tensor(out_good_steps), Tensor(out_bad_steps) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 
7e960d73bbbd71c2895b472cce0459280a54da13..cb6f67fbdf26641352086aa6cf3f9f475a65eea2 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1270,9 +1270,20 @@ attrs : [bool use_mkldnn = false, bool use_cudnn = false] - op : squeeze (squeeze2) - backward : squeeze_grad (squeeze2_grad) + backward : squeeze_grad (squeeze2_grad), squeeze_double_grad(squeeze2_double_grad) + inputs : + x : X + attrs : + axis : axes + outputs : + {out : Out, xshape : XShape} + int_array: + axis : + data_type : int + support_tensor : true extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + outputs : [xshape] - op : stack backward : stack_grad @@ -1389,6 +1400,22 @@ outputs : out : Y +- op : unsqueeze (unsqueeze2) + backward : unsqueeze_grad (unsqueeze2_grad), unsqueeze_double_grad(unsqueeze2_double_grad) + inputs : + x : X + attrs : + axis : axes + outputs : + {out : Out, xshape : XShape} + int_array: + axis : + data_type : int + tensor_name : AxesTensor + tensors_name : AxesTensorList + extra : + outputs : [xshape] + - op : unstack backward : unstack_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 0e85b2d8dffaf7db2256e7b0f6540a82e4c8220c..e5378ce07718b033921b668119f7751d9fa7e391 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1054,6 +1054,19 @@ square_sr {selected_rows -> selected_rows} backward : square_grad +- op : squeeze + args : (Tensor x, IntArray axis={}) + output : Tensor(out), Tensor(xshape) + infer_meta : + func : SqueezeWithXShapeInferMeta + kernel : + func : squeeze_with_xshape + data_type : x + inplace : (x -> out) + view: (x -> out) + intermediate : xshape + backward : squeeze_grad + - op : svd args : (Tensor x, bool full_matrices = false) output : Tensor(u), Tensor(s), Tensor(vh) @@ -1149,6 +1162,19 @@ func : unfold backward : unfold_grad +- op : unsqueeze + args : (Tensor x, IntArray axis = {}) + output : Tensor(out), Tensor(xshape) + 
infer_meta : + func : UnsqueezeWithXShapeInferMeta + kernel : + func : unsqueeze_with_xshape + data_type : x + inplace : (x -> out) + view: (x -> out) + intermediate : xshape + backward : unsqueeze_grad + - op : unstack args : (Tensor x, int axis=0, int num=0) output : Tensor[](out){num} diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 895d50c7bbd2a7b5c63295efa1aaea1b5bbd279d..8a5247ae64baa10c3a234d0c0b98749376c31073 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -917,9 +917,6 @@ void ExpandInferMeta(const MetaTensor& x, auto out_rank = std::max(static_cast(x_dims.size()), expand_shape.size()); std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - auto diff = expand_shape.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); for (size_t i = 0; i < expand_shape.size(); ++i) { if (x_dims[i] == -1) { out_shape[i] = -1; diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index e12c5f10fd1c4cb5a0da65d044f606e4af9f709b..25bbd17c4feab2dbc4a57b6578ebc3662ab83fcc 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -106,8 +106,7 @@ file( "fusion/gpu/*.cu") if(WITH_CUTLASS) - file(GLOB cutlass_cu "fusion/cutlass/default_moe_fc_traits.h" - "fusion/cutlass/linear_combination_ft_gelu.h" "fusion/cutlass/moe*") + file(GLOB cutlass_cu "fusion/cutlass/conv2d/*.cu" "fusion/cutlass/*.cu") list(APPEND kernel_cu ${cutlass_cu}) endif() diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 49020337e08d8c5033338fdf0174de90ee98d1bf..cf974bdbe333b69fe3e7f10620c324aea14a9d19 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -1023,15 +1023,20 @@ void BroadcastKernel(const KPDevice &ctx, std::vector *outs, int axis, Functor func) { - std::vector dims_size; - dims_size.reserve(ins.size()); + // 
When there are multiple inputs, the outputs' rank should be equal to the + // maximum rank of all inputs. + int max_rank = 0; + int min_rank = phi::DDim::kMaxRank; for (auto *in : ins) { - dims_size.emplace_back(in->dims().size()); + max_rank = std::max(max_rank, in->dims().size()); + min_rank = std::min(min_rank, in->dims().size()); } - - axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) - - *std::min_element(dims_size.begin(), dims_size.end()) - : axis; + if (ins.size() == 1) { + // When there is only 1 input, the input's rank may be less than outputs' + // rank. + max_rank = std::max(max_rank, (*outs)[0]->dims().size()); + } + axis = axis == -1 ? max_rank - min_rank : axis; BroadcastKernelForDifferentVecSize( ctx, ins, outs, axis, func); } diff --git a/paddle/phi/kernels/funcs/dims_simplifier.h b/paddle/phi/kernels/funcs/dims_simplifier.h index 3912357546944734dc7101122cda298d23a93b17..a52373c117e3ee113230d2e04f5604b9d58793b0 100644 --- a/paddle/phi/kernels/funcs/dims_simplifier.h +++ b/paddle/phi/kernels/funcs/dims_simplifier.h @@ -25,8 +25,8 @@ struct BroadcastDimsSimplifier { typedef void (*MergeFunctor)( bool &, std::vector &, DimVector &, int, int); - int64_t N; - int64_t rank; + int N; + int rank; DimVector out_dims; std::vector in_dims; @@ -103,41 +103,43 @@ struct BroadcastDimsSimplifier { // To compensate the lackage of input_tensors' dimension with axis. 
void ExtendInputDimensions(int N, int axis) { for (auto &in_dim : in_dims) { - int64_t in_idx = 0; if (in_dim.size() < rank) { - DimVector tmp_dim(rank, 1); - for (; in_idx < in_dim.size();) { - if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { - tmp_dim[axis] = in_dim[in_idx]; - in_idx++; - axis++; + DimVector extended_in_dim(rank, 1); + int out_idx = axis; + for (int in_idx = 0; in_idx < in_dim.size(); in_idx++) { + if (in_dim[in_idx] == out_dims[out_idx] || in_dim[in_idx] == 1) { + extended_in_dim[out_idx] = in_dim[in_idx]; + out_idx++; } else { PADDLE_THROW(phi::errors::InvalidArgument( "The %d-th dimension of input tensor is expected to be equal " "with the %d-th dimension of output tensor %d or 1, but " - "received %d.", - in_idx + 1, - axis + 1, + "received %d. The input's shape is {%s}, the output's shape is " + "{%s}.", + in_idx, + out_idx, out_dims[axis], - in_dim[in_idx])); + in_dim[in_idx], + phi::make_ddim(in_dim), + phi::make_ddim(out_dims))); } } in_dim.resize(rank); - std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); + std::copy( + extended_in_dim.begin(), extended_in_dim.end(), in_dim.begin()); } else { - for (; in_idx < rank;) { - if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { - in_idx++; - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "received %d.", - in_idx + 1, - in_idx + 1, - out_dims[in_idx], - in_dim[in_idx])); - } + for (int in_idx = 0; in_idx < rank; in_idx++) { + PADDLE_ENFORCE_EQ( + in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1, + true, + phi::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "received %d.", + in_idx, + in_idx, + out_dims[in_idx], + in_dim[in_idx])); } } std::reverse(in_dim.begin(), in_dim.end()); diff --git 
a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias.cu new file mode 100644 index 0000000000000000000000000000000000000000..308fd276c12be527d8fb21078eb6e95ba2ee4e6b --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias.cu @@ -0,0 +1,225 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { + +template +cutlass::Status Conv2dBiasImpl(ConvAllParams params) { + using ElementAccumulator = float; + using ElementComputeEpilogue = float; + using ElementInputA = cutlass::half_t; + using ElementInputB = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using LayoutInputA = cutlass::layout::TensorNHWC; + using LayoutInputB = cutlass::layout::TensorNHWC; + using LayoutOutput = cutlass::layout::TensorNHWC; + using MMAOp = cutlass::arch::OpClassTensorOp; + using SmArch = cutlass::arch::Sm75; + using ThreadblockShape = TShape; + using WarpShape = WShape; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using SwizzleThreadBlock = + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>; + constexpr int NumStages = 2; + static 
cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = + cutlass::conv::IteratorAlgorithm::kOptimized; + using EpilogueOp = + cutlass::epilogue::thread::LinearCombination; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm, + cutlass::conv::StrideSupport::kStrided, + Alignment, + Alignment>::Kernel; + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + mode, + 1); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + 
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +// config 0 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 64>>( + ConvAllParams); +// config 1 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 64>>( + ConvAllParams); +// config 2 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 64>>( + ConvAllParams); +// config 3 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 64>>( + ConvAllParams); +// config 4 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 32>>( + ConvAllParams); +// config 5 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 64, 32>>( + ConvAllParams); +// config 6 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<64, 64, 32>>( + ConvAllParams); +// config 7 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<64, 64, 32>>( + ConvAllParams); +// config 8 +template cutlass::Status Conv2dBiasImpl, + cutlass::gemm::GemmShape<64, 32, 32>>( + ConvAllParams); + +std::vector> + conv2d_bias_all_func = { + Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 32, 32>>, + Conv2dBiasImpl, + cutlass::gemm::GemmShape<32, 64, 32>>, + Conv2dBiasImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasImpl, + 
cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasImpl, + cutlass::gemm::GemmShape<64, 32, 32>>}; + +std::map, int> map_problem_conv2d_bias; +std::mutex conv2d_bias_mutex; + +void Conv2dBias(ConvAllParams params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w}; + + if (map_problem_conv2d_bias.count(problem_size)) { + conv2d_bias_all_func[map_problem_conv2d_bias.at(problem_size)](params); + return; + } + + int best_config_index = + ProfileToGetBestConfig(conv2d_bias_all_func, params, CONV2D_BIAS); + + std::lock_guard guard(conv2d_bias_mutex); + map_problem_conv2d_bias[problem_size] = best_config_index; + conv2d_bias_all_func[best_config_index](params); +} +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_add_relu.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_add_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..3fac4f5673b7f0bb09b3d1afca4213610227a1c0 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_add_relu.cu @@ -0,0 +1,248 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h" +#include "cutlass/epilogue/thread/linear_combination_residual_block.h" +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + +namespace phi { +namespace fusion { + +namespace cutlass_internal { + +template +cutlass::Status Conv2dBiasAddReluImpl(ConvAllParams params) { + using EpilogueOp = cutlass::epilogue::thread::LinearCombinationResidualBlock< + cutlass::half_t, + float, + float, + cutlass::half_t, + Alignment, + cutlass::epilogue::thread::Identity, + cutlass::plus, + cutlass::epilogue::thread::ReLu>; + + using Conv2dFpropKernel = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + TShape, + WShape, + cutlass::gemm::GemmShape<16, 8, 8>, + EpilogueOp, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + Alignment, + Alignment>::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + const half *residual = params.residual; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + 
+ cutlass::conv::Conv2dProblemSize problem_size( + {batch, ih, iw, ic}, + {oc, kh, kw, ic}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + 1); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {ic, ic * kw, ic * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), + nullptr, + 0, + oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +// config 0 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 1 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 2 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 3 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 4 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams); +// config 5 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 64, 
32>>(ConvAllParams); +// config 6 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 7 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 8 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams); +// config 9 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 10 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 11 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 12 +template cutlass::Status + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); + +std::vector> + conv2d_bias_add_relu_all_func = { + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 32>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<32, 64, 32>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 32, 32>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasAddReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>}; +std::map, int> map_problem_conv2d_bias_add_relu; +std::mutex conv2d_bias_add_relu_mutex; + +void Conv2dBiasAddRelu(ConvAllParams params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = 
params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w}; + + if (map_problem_conv2d_bias_add_relu.count(problem_size)) { + conv2d_bias_add_relu_all_func[map_problem_conv2d_bias_add_relu.at( + problem_size)](params); + return; + } + + std::lock_guard guard(conv2d_bias_add_relu_mutex); + + // config 6's diff is large. + conv2d_bias_add_relu_all_func[6] = nullptr; + + int best_config_index = ProfileToGetBestConfig( + conv2d_bias_add_relu_all_func, params, CONV2D_BIAS_ADD_RELU); + map_problem_conv2d_bias_add_relu[problem_size] = best_config_index; + conv2d_bias_add_relu_all_func[best_config_index](params); +} +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_leaky_relu.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_leaky_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..97ca75e477644ccddf83d4dd24b3b1b98cc04769 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_leaky_relu.cu @@ -0,0 +1,226 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/epilogue/thread/linear_combination_leaky_relu.h" +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { +template +cutlass::Status Conv2dBiasLeakyReluImpl(ConvAllParams params) { + using ElementAccumulator = float; + using ElementComputeEpilogue = float; + using ElementInputA = cutlass::half_t; + using ElementInputB = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using LayoutInputA = cutlass::layout::TensorNHWC; + using LayoutInputB = cutlass::layout::TensorNHWC; + using LayoutOutput = cutlass::layout::TensorNHWC; + using MMAOp = cutlass::arch::OpClassTensorOp; + using SmArch = cutlass::arch::Sm75; + using ThreadblockShape = TShape; + using WarpShape = WShape; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using SwizzleThreadBlock = + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>; + constexpr int NumStages = 2; + static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = + cutlass::conv::IteratorAlgorithm::kOptimized; + using EpilogueOp = cutlass::epilogue::thread::LinearCombinationLeakyRelu< + ElementOutput, + Alignment, + float, + ElementComputeEpilogue>; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm, + cutlass::conv::StrideSupport::kStrided, + Alignment, + Alignment>::Kernel; + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int 
ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + float alpha = params.alpha; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + mode, + 1); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +// config 0 +template cutlass::Status Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 1 +template cutlass::Status Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 2 +template cutlass::Status 
Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<128, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 3 +template cutlass::Status Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 4 +template cutlass::Status Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams); +// config 5 +template cutlass::Status Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams); +// config 6 +template cutlass::Status Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 7 +template cutlass::Status Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 8 +template cutlass::Status Conv2dBiasLeakyReluImpl< + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams); + +std::vector> + conv2d_bias_leaky_relu_all_func = { + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<32, 32, 32>>, + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<32, 64, 32>>, + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasLeakyReluImpl, + cutlass::gemm::GemmShape<64, 32, 32>>}; + +std::map, int> map_problem_conv2d_bias_leaky_relu; +std::mutex conv2d_bias_leaky_relu_mutex; + +void Conv2dBiasLeakyRelu(ConvAllParams params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + 
int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w}; + + if (map_problem_conv2d_bias_leaky_relu.count(problem_size)) { + conv2d_bias_leaky_relu_all_func[map_problem_conv2d_bias_leaky_relu.at( + problem_size)](params); + return; + } + + int best_config_index = ProfileToGetBestConfig( + conv2d_bias_leaky_relu_all_func, params, CONV2D_BIAS_LEAKY_RELU); + + std::lock_guard guard(conv2d_bias_leaky_relu_mutex); + map_problem_conv2d_bias_leaky_relu[problem_size] = best_config_index; + conv2d_bias_leaky_relu_all_func[best_config_index](params); +} +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5f5a9bee12c644b12d5b493407e17b7ccaef6e1 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu.cu @@ -0,0 +1,225 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { +template +cutlass::Status Conv2dBiasReluImpl(ConvAllParams params) { + using ElementAccumulator = float; + using ElementComputeEpilogue = float; + using ElementInputA = cutlass::half_t; + using ElementInputB = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using LayoutInputA = cutlass::layout::TensorNHWC; + using LayoutInputB = cutlass::layout::TensorNHWC; + using LayoutOutput = cutlass::layout::TensorNHWC; + using MMAOp = cutlass::arch::OpClassTensorOp; + using SmArch = cutlass::arch::Sm75; + using ThreadblockShape = TShape; + using WarpShape = WShape; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using SwizzleThreadBlock = + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>; + constexpr int NumStages = 2; + static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = + cutlass::conv::IteratorAlgorithm::kOptimized; + using EpilogueOp = + cutlass::epilogue::thread::LinearCombinationRelu; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm, + cutlass::conv::StrideSupport::kStrided, + Alignment, + Alignment>::Kernel; + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = 
params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + mode, + 1); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +// config 0 +template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 1 +template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 2 +template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 3 +template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 4 
+template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams); +// config 5 +template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams); +// config 6 +template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 7 +template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 8 +template cutlass::Status + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams); + +std::vector> + conv2d_bias_relu_all_func = { + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 32, 32>>, + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<32, 64, 32>>, + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasReluImpl, + cutlass::gemm::GemmShape<64, 32, 32>>}; +std::map, int> map_problem_conv2d_bias_relu; +std::mutex conv2d_bias_relu_mutex;
+
+// Runs one of the Conv2dBiasReluImpl configs from conv2d_bias_relu_all_func on
+// `params`. The config index chosen for a given problem size is remembered in
+// map_problem_conv2d_bias_relu; the first call for a new problem size asks
+// ProfileToGetBestConfig for the index to use.
+void Conv2dBiasRelu(ConvAllParams params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  // Key of the per-problem tuning cache.
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w};
+
+  // Entries are inserted into map_problem_conv2d_bias_relu while holding
+  // conv2d_bias_relu_mutex, so the lookup has to hold it as well: reading a
+  // std::map while another thread inserts into it is a data race. The lock is
+  // dropped again before the cached kernel is invoked.
+  bool has_cached_config = false;
+  int cached_config_index = 0;
+  {
+    std::lock_guard<std::mutex> lookup_guard(conv2d_bias_relu_mutex);
+    auto cached = map_problem_conv2d_bias_relu.find(problem_size);
+    if (cached != map_problem_conv2d_bias_relu.end()) {
+      has_cached_config = true;
+      cached_config_index = cached->second;
+    }
+  }
+  if (has_cached_config) {
+    conv2d_bias_relu_all_func[cached_config_index](params);
+    return;
+  }
+
+  // Cache miss: pick a config index without holding the mutex; the result is
+  // recorded under the mutex right below.
+  int best_config_index = ProfileToGetBestConfig(
+      conv2d_bias_relu_all_func, params, CONV2D_BIAS_RELU);
+
+  std::lock_guard<std::mutex>
guard(conv2d_bias_relu_mutex); + map_problem_conv2d_bias_relu[problem_size] = best_config_index; + conv2d_bias_relu_all_func[best_config_index](params); +} +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu new file mode 100644 index 0000000000000000000000000000000000000000..1acd191033529eb3b0aff8c616fec21be59f5265 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu @@ -0,0 +1,218 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { +template +cutlass::Status Conv2dBiasReluFewChannelsImpl(ConvAllParams params) { + using ElementAccumulator = float; + using ElementComputeEpilogue = float; + using ElementInputA = cutlass::half_t; + using ElementInputB = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using LayoutInputA = cutlass::layout::TensorNHWC; + using LayoutInputB = cutlass::layout::TensorNHWC; + using LayoutOutput = cutlass::layout::TensorNHWC; + using MMAOp = cutlass::arch::OpClassTensorOp; + using SmArch = cutlass::arch::Sm75; + using ThreadblockShape = TShape; + using WarpShape = WShape; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using SwizzleThreadBlock = + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>; + constexpr int NumStages = 2; + static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = + cutlass::conv::IteratorAlgorithm::kFewChannels; + using EpilogueOp = + cutlass::epilogue::thread::LinearCombinationRelu; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm, + cutlass::conv::StrideSupport::kStrided, + Alignment, + Alignment>::Kernel; + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + 
int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w1; + + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + mode, + 1); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +// config 0 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 1 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 2 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<128, 32, 64>, + 
cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 3 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 4 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams); +// config 5 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams); +// config 6 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 7 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 8 +template cutlass::Status Conv2dBiasReluFewChannelsImpl< + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams); + +std::vector> + conv2d_bias_relu_few_channels_all_func = { + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<32, 32, 32>>, + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<32, 64, 32>>, + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasReluFewChannelsImpl, + cutlass::gemm::GemmShape<64, 32, 32>>}; +std::map, int> map_problem_conv2d_bias_relu_few_channels; + +void Conv2dBiasReluFewChannels(ConvAllParams params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int 
kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w1; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w}; + + if (map_problem_conv2d_bias_relu_few_channels.count(problem_size)) { + conv2d_bias_relu_few_channels_all_func + [map_problem_conv2d_bias_relu_few_channels.at(problem_size)](params); + return; + } + // +} +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_silu.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_silu.cu new file mode 100644 index 0000000000000000000000000000000000000000..469585ccf8398b9e49d12eb2accc1481eb9b84af --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_silu.cu @@ -0,0 +1,226 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/epilogue/thread/linear_combination_silu.h" +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { +template +cutlass::Status Conv2dBiasSiluImpl(ConvAllParams params) { + using ElementAccumulator = float; + using ElementComputeEpilogue = float; + using ElementInputA = cutlass::half_t; + using ElementInputB = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using LayoutInputA = cutlass::layout::TensorNHWC; + using LayoutInputB = cutlass::layout::TensorNHWC; + using LayoutOutput = cutlass::layout::TensorNHWC; + using MMAOp = cutlass::arch::OpClassTensorOp; + using SmArch = cutlass::arch::Sm75; + using ThreadblockShape = TShape; + using WarpShape = WShape; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using SwizzleThreadBlock = + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>; + constexpr int NumStages = 2; + static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = + cutlass::conv::IteratorAlgorithm::kOptimized; + using EpilogueOp = + cutlass::epilogue::thread::LinearCombinationSilu; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm, + cutlass::conv::StrideSupport::kStrided, + Alignment, + Alignment>::Kernel; + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = 
params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + mode, + 1); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +// config 0 +template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 1 +template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 2 +template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 3 +template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); +// config 4 
+template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams); +// config 5 +template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams); +// config 6 +template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 7 +template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); +// config 8 +template cutlass::Status + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams); + +std::vector> + conv2d_bias_silu_all_func = { + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 64>>, + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 32, 32>>, + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<32, 64, 32>>, + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<64, 64, 32>>, + Conv2dBiasSiluImpl, + cutlass::gemm::GemmShape<64, 32, 32>>}; + +std::map, int> map_problem_conv2d_bias_silu; +std::mutex conv2d_bias_silu_mutex;
+
+// Runs one of the Conv2dBiasSiluImpl configs from conv2d_bias_silu_all_func on
+// `params`. The config index chosen for a given problem size is remembered in
+// map_problem_conv2d_bias_silu; the first call for a new problem size asks
+// ProfileToGetBestConfig for the index to use.
+void Conv2dBiasSilu(ConvAllParams params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  // Key of the per-problem tuning cache.
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w};
+
+  // Entries are inserted into map_problem_conv2d_bias_silu while holding
+  // conv2d_bias_silu_mutex, so the lookup has to hold it as well: reading a
+  // std::map while another thread inserts into it is a data race. The lock is
+  // dropped again before the cached kernel is invoked.
+  bool has_cached_config = false;
+  int cached_config_index = 0;
+  {
+    std::lock_guard<std::mutex> lookup_guard(conv2d_bias_silu_mutex);
+    auto cached = map_problem_conv2d_bias_silu.find(problem_size);
+    if (cached != map_problem_conv2d_bias_silu.end()) {
+      has_cached_config = true;
+      cached_config_index = cached->second;
+    }
+  }
+  if (has_cached_config) {
+    conv2d_bias_silu_all_func[cached_config_index](params);
+    return;
+  }
+
+  // Cache miss: pick a config index without holding the mutex; the result is
+  // recorded under the mutex right below.
+  int best_config_index = ProfileToGetBestConfig(
+      conv2d_bias_silu_all_func, params, CONV2D_BIAS_SILU);
+
+  std::lock_guard<std::mutex>
guard(conv2d_bias_silu_mutex); + + map_problem_conv2d_bias_silu[problem_size] = best_config_index; + conv2d_bias_silu_all_func[best_config_index](params); +} +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h new file mode 100644 index 0000000000000000000000000000000000000000..b740d49fc1dc3fb0e021e80f436e783c5a392aea --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { + +typedef struct { + const half *input; + const half *weight; + const half *bias; + const half *residual; + half *output; + int batch; + int ic; + int ih; + int iw; + int kh; + int kw; + int oc; + int pad_h0; + int pad_h1; + int pad_w0; + int pad_w1; + int stride_h; + int stride_w; + int dilation_h; + int dilation_w; + int oh; + int ow; + const phi::GPUContext *ctx; + float alpha; // for leaky_relu use +} ConvAllParams; + +// Below functions are provided by cutlass, they are called by phi. 
+void Conv2dBiasAddRelu(ConvAllParams params); +void Conv2dBiasRelu(ConvAllParams params); +void Conv2dBiasLeakyRelu(ConvAllParams params); +void Conv2dBiasSilu(ConvAllParams params); +void Conv2dBias(ConvAllParams params); +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu new file mode 100644 index 0000000000000000000000000000000000000000..174cb4aaa405956811c8b3203a1c08f97376be97 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu @@ -0,0 +1,277 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { +struct logical_coord { + int n; + int c; + int h; + int w; +}; + +float diff(const half *c, const float *c_baseline, int n) { + float max_diff = -1.; + for (int i = 0; i < n; i++) { + float c_value = __half2float(c[i]); + if (std::abs(c_baseline[i] - c_value) > max_diff) { + max_diff = std::abs(c_baseline[i] - c_value); + } + } + return max_diff; +} + +__device__ int gpu_nhwc(struct logical_coord shape, + struct logical_coord index) { + return index.n * shape.h * shape.w * shape.c + index.h * shape.w * shape.c + + index.w * shape.c + index.c; +} + +__global__ void naive_conv2d_kernel(const half *input, + const half *weight, + const half *bias, + float *output, + int batch, + int ic, + int ih, + int iw, + int kh, + int kw, + int oc, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int oh, + int ow, + const half *residual, + float alpha, // for leaky_relu + OpType op_type) { + int M = batch * oh * ow; + int N = oc; + int K = ic * kh * kw; + int m_i = threadIdx.x + blockIdx.x * blockDim.x; + int n_i = threadIdx.y + blockIdx.y * blockDim.y; + if (m_i >= M || n_i >= N) return; + + int batch_i = m_i / (oh * ow); + int oh_i = (m_i % (oh * ow)) / ow; + int ow_i = (m_i % (oh * ow)) % ow; + int oc_i = n_i; + + struct logical_coord weight_shape = {oc, ic, kh, kw}; + struct logical_coord input_shape = {batch, ic, ih, iw}; + int out_offset = m_i * N + n_i; + float *out_ptr = output + out_offset; + float sum = 0.f; + + for (int k_i = 0; k_i < K; k_i++) { + int ic_i = k_i / (kh * kw); + int kh_i = (k_i % (kh * kw)) / kw; + int kw_i = (k_i % (kh * kw)) % kw; + + struct logical_coord weight_index = {oc_i, ic_i, kh_i, kw_i}; + + int ih_i = oh_i * stride_h - pad_h + kh_i * dilation_h; + int iw_i = ow_i * stride_w - pad_w + kw_i * dilation_w; + + if (ih_i < 0 || ih_i >= ih) continue; + 
if (iw_i < 0 || iw_i >= iw) continue; + + struct logical_coord input_index = {batch_i, ic_i, ih_i, iw_i}; + const half *weight_ptr = weight + gpu_nhwc(weight_shape, weight_index); + const half *in_ptr = input + gpu_nhwc(input_shape, input_index); + sum += __half2float(*in_ptr) * __half2float(*weight_ptr); + } + + sum += __half2float(*(bias + oc_i)); + float x = sum; + + switch (op_type) { + case CONV2D_BIAS: + *out_ptr = x; + break; + case CONV2D_BIAS_RELU: + *out_ptr = x > 0 ? x : 0; + break; + case CONV2D_BIAS_SILU: + *out_ptr = x * (1.f / (1 + exp(-x))); + break; + case CONV2D_BIAS_ADD_RELU: + x += __half2float(*(residual + out_offset)); + *out_ptr = x > 0 ? x : 0; + break; + case CONV2D_BIAS_LEAKY_RELU: + *out_ptr = x > 0 ? x : (x * alpha); + break; + default: + break; + } +} + +float conv2d_diff_gpu(ConvAllParams params, OpType op_type) { + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h = params.pad_h0; + int pad_w = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + const half *residual = params.residual; + + int oh = params.oh; + int ow = params.ow; + int M = batch * oh * ow; + int N = oc; + + constexpr int blockM = 16; + constexpr int blockN = 16; + uint3 grid = {(M + blockM - 1) / blockM, (N + blockN - 1) / blockN, 1}; + uint3 block = {blockM, blockN, 1}; + + int output_size = batch * oc * oh * ow; + half *output_from_cutlass = + reinterpret_cast(malloc(sizeof(half) * output_size)); + cudaMemcpy(output_from_cutlass, + output, + output_size * sizeof(half), + cudaMemcpyDeviceToHost); + + float *gpu_output; + cudaMalloc(&gpu_output, output_size * sizeof(float)); + 
naive_conv2d_kernel<<>>(input, + weight, + bias, + gpu_output, + batch, + ic, + ih, + iw, + kh, + kw, + oc, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + oh, + ow, + residual, + params.alpha, + op_type); + float *output_from_gpu = + reinterpret_cast(malloc(sizeof(float) * output_size)); + cudaMemcpy(output_from_gpu, + gpu_output, + output_size * sizeof(float), + cudaMemcpyDeviceToHost); + float max_diff = diff(output_from_cutlass, output_from_gpu, output_size); + + free(output_from_cutlass); + free(output_from_gpu); + cudaFree(gpu_output); + return max_diff; +} + +std::string OpType2String(OpType op_type) { + switch (op_type) { + case CONV2D_BIAS: + return "conv2d_bias"; + break; + case CONV2D_BIAS_RELU: + return "conv2d_bias_relu"; + break; + case CONV2D_BIAS_SILU: + return "conv2d_bias_add_silu"; + break; + case CONV2D_BIAS_ADD_RELU: + return "conv2d_bias_add_relu"; + break; + case CONV2D_BIAS_LEAKY_RELU: + return "conv2d_bias_leaky_relu"; + default: + break; + } + return "unnamed_op"; +} + +int ProfileToGetBestConfig( + const std::vector> &all_func, + ConvAllParams params, + OpType op_type) { + constexpr int WARMUP = 10; + constexpr int REPEAT = 100; + float min_time = 100000.f; + int min_time_index = -1; + for (int i = 0; i < all_func.size(); i++) { + cutlass::Status status; + auto func = all_func[i]; + // When func has large diff, we will make it nullptr. 
+ if (!func) continue; + + for (int ii = 0; ii < WARMUP; ii++) { + status = func(params); + } + + cudaEvent_t beg, end; + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&beg)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&end)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(beg)); + for (int ii = 0; ii < REPEAT; ii++) { + status = func(params); + } + + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(end)); + float elapsed_time; + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime(&elapsed_time, beg, end)); + if (elapsed_time < min_time && status == cutlass::Status::kSuccess) { + min_time = elapsed_time; + min_time_index = i; + } + // debug code + VLOG(3) << OpType2String(op_type) << ": tactic " << i << " has max diff " + << conv2d_diff_gpu(params, op_type) << " compared with baseline."; + } + + if (min_time_index < 0) { + PADDLE_THROW( + phi::errors::NotFound("Can't find any cutlass config for this %s op.", + OpType2String(op_type).c_str())); + } + return min_time_index; +} + +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h new file mode 100644 index 0000000000000000000000000000000000000000..a5d0f83651488ee718de1e07ba2ae96b998c6c52 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h" + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { +#define CUTLASS_CHECK(status) \ + if (status != cutlass::Status::kSuccess) { \ + VLOG(3) \ + << "Cutlass can not deal with this problem size, skip this kernel!"; \ + return status; \ + } + +typedef enum { + CONV2D_BIAS, + CONV2D_BIAS_RELU, + CONV2D_BIAS_ADD_RELU, + CONV2D_BIAS_SILU, + CONV2D_BIAS_LEAKY_RELU +} OpType; + +// conv2d_diff_gpu calculate diff of cutlass output and baseline output, you can +// use them to debug. return value is the max diff between cutlass and baseline. +float conv2d_diff_gpu(ConvAllParams params, OpType op_type); + +int ProfileToGetBestConfig( + const std::vector>& all_func, + ConvAllParams params, + OpType op_type); + +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d_fusion.cu b/paddle/phi/kernels/fusion/cutlass/conv2d_fusion.cu new file mode 100644 index 0000000000000000000000000000000000000000..93c5581ce9db6f3b2d50c8b1872b07cc864124ba --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d_fusion.cu @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { +template +void Conv2dFusionKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& bias, + const paddle::optional& residual, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + const std::string& activation, + float fuse_alpha, + DenseTensor* output) { + ctx.template Alloc(output); + auto in_dims = x.dims(); + auto filter_dims = filter.dims(); + auto out_dims = output->dims(); + CHECK_EQ(in_dims.size() == 4UL, true); + CHECK_EQ(filter_dims.size() == 4UL, true); + CHECK_EQ(strides.size() == 2UL, true); + CHECK_EQ(dilations.size() == 2UL, true); + CHECK_EQ(groups == 1, true); + CHECK_EQ(padding_algorithm == "EXPLICIT", true); + const int batch = in_dims[0]; + const int ic = in_dims[3]; + const int ih = in_dims[1]; + const int iw = in_dims[2]; + int pad_h0 = 0; + int pad_h1 = 0; + int pad_w0 = 0; + int pad_w1 = 0; + if (paddings.size() == 2UL) { + pad_h0 = paddings[0]; + pad_h1 = paddings[0]; + pad_w0 = paddings[1]; + pad_w1 = paddings[1]; + } else if (paddings.size() == 4UL) { + pad_h0 = paddings[0]; + pad_h1 = paddings[1]; + pad_w0 = paddings[2]; + pad_w1 = paddings[3]; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Attr 
paddins in conv2d_fusion must have 2 or 4 elements, but now have " + "%u elements.", + paddings.size())); + } + + const int stride_h = strides[0]; + const int stride_w = strides[1]; + const int dilation_h = dilations[0]; + const int dilation_w = dilations[1]; + const int oc = filter_dims[0]; + const int kh = filter_dims[1]; + const int kw = filter_dims[2]; + + CHECK_EQ(out_dims.size() == 4UL, true); + const int oh = out_dims[1]; + const int ow = out_dims[2]; + + ConvAllParams params = {reinterpret_cast(x.data()), + reinterpret_cast(filter.data()), + reinterpret_cast(bias.data()), + nullptr, + reinterpret_cast(output->data()), + batch, + ic, + ih, + iw, + kh, + kw, + oc, + pad_h0, + pad_h1, + pad_w0, + pad_w1, + stride_h, + stride_w, + dilation_h, + dilation_w, + oh, + ow, + &ctx}; + + if (residual) { + if (activation == "relu") { + params.residual = reinterpret_cast(residual->data()); + Conv2dBiasAddRelu(params); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Cutlass now only support relu activation in a residual block")); + } + } else if (activation == "relu") { + Conv2dBiasRelu(params); + } else if (activation == "swish") { + Conv2dBiasSilu(params); + } else if (activation == "identity") { + Conv2dBias(params); + } else if (activation == "leaky_relu") { + params.alpha = fuse_alpha; + Conv2dBiasLeakyRelu(params); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Cutlass does not support this activation: %s.", activation.c_str())); + } + output->set_layout(DataLayout::NHWC); +} +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_fusion_cutlass, + GPU, + ALL_LAYOUT, + phi::fusion::cutlass_internal::Conv2dFusionKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/expand_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_grad_kernel.cu index e8729942b6e6c9458c072f4baccea272b5512978..35a6681b7afa3b133095b20e4ddc603847e583cd 100644 --- 
a/paddle/phi/kernels/gpu/expand_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_grad_kernel.cu @@ -17,7 +17,28 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" + +namespace phi { + +template +void ExpandGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const IntArray& shape, + DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + if (x_grad->dims() == out_grad.dims()) { + phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); + } else { + std::vector reduce_dims = + funcs::GetReduceDim(x_grad->dims(), out_grad.dims(), -1); + funcs::ReduceKernel>( + ctx, out_grad, x_grad, kps::IdentityFunctor(), reduce_dims); + } +} + +} // namespace phi PD_REGISTER_KERNEL(expand_grad, GPU, @@ -26,5 +47,6 @@ PD_REGISTER_KERNEL(expand_grad, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/expand_kernel.cu b/paddle/phi/kernels/gpu/expand_kernel.cu index 27c4e82c6354440e892899f02e8e05171a504e1a..b2f973b0a8896a0eb24679bf7ee989446a7d25a7 100644 --- a/paddle/phi/kernels/gpu/expand_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_kernel.cu @@ -18,7 +18,66 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/expand_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" + +namespace phi { + +template +void ExpandKernel(const Context& ctx, + const DenseTensor& x, + const IntArray& shape, + DenseTensor* out) { + auto expand_shape = shape.GetData(); + auto diff = expand_shape.size() - x.dims().size(); + auto out_shape = phi::vectorize(x.dims()); + out_shape.insert(out_shape.begin(), diff, 1); + for (size_t i = 0; i < out_shape.size(); ++i) { + PADDLE_ENFORCE_NE( 
+ expand_shape[i], + 0, + phi::errors::InvalidArgument("The expanded size cannot be zero.")); + if (i < diff) { + PADDLE_ENFORCE_GT( + expand_shape[i], + 0, + phi::errors::InvalidArgument( + "The expanded size (%d) for non-existing dimensions must be " + "positive for expand kernel.", + expand_shape[i])); + out_shape[i] = expand_shape[i]; + } else if (expand_shape[i] > 0) { + if (out_shape[i] != 1) { + PADDLE_ENFORCE_EQ( + out_shape[i], + expand_shape[i], + phi::errors::InvalidArgument( + "The value (%d) of the non-singleton dimension does not match" + " the corresponding value (%d) in shape for expand kernel.", + out_shape[i], + expand_shape[i])); + } else { + out_shape[i] = expand_shape[i]; + } + } else { + PADDLE_ENFORCE_EQ( + expand_shape[i], + -1, + phi::errors::InvalidArgument( + "When the value in shape is negative for expand_v2 op, " + "only -1 is supported, but the value received is %d.", + expand_shape[i])); + } + } + + out->Resize(phi::make_ddim(out_shape)); + ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + phi::funcs::BroadcastKernel( + ctx, ins, &outs, -1, kps::IdentityFunctor()); +} + +} // namespace phi PD_REGISTER_KERNEL(expand, GPU, @@ -27,6 +86,7 @@ PD_REGISTER_KERNEL(expand, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t, bool) {} diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index 6e9dbf37a910015d9dfd7f701222d262f0a825b0..7945d6c8fcbafc9b70d91f5d3c34e14f1ca03fc6 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -101,6 +101,9 @@ void FlipKernel(const Context& dev_ctx, DenseTensor* out) { const size_t total_dims = x.dims().size(); switch (total_dims) { + case 0: + LaunchFlipCudaKernel(dev_ctx, x, axis, out); + break; case 1: LaunchFlipCudaKernel(dev_ctx, x, axis, out); break; diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc index 
22ff9b3e1a8347104bb4f7fdd4ec322c4b3dd6a6..6963d6a06d8203388a3de0fa3bbcd40cdc6a90bf 100644 --- a/paddle/phi/ops/compat/conv2d_sig.cc +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -53,9 +53,24 @@ KernelSignature Conv2dDoubleGradOpArgumentMapping( {"DInput", "DFilter", "DDOutput"}); } +KernelSignature Conv2dFusionArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_fusion_cutlass", + {"Input", "Filter", "Bias", "ResidualData"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "activation", + "fuse_alpha"}, + {"Output"}); +} } // namespace phi PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_fusion_cutlass, + phi::Conv2dFusionArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad, phi::Conv2dDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc deleted file mode 100644 index 4ca45903acfa00386c9cbfed191ddb9b50443230..0000000000000000000000000000000000000000 --- a/paddle/phi/ops/compat/squeeze_sig.cc +++ /dev/null @@ -1,35 +0,0 @@ - -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "squeeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); -} - -KernelSignature SqueezeGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "squeeze_grad", {"XShape", "Out@GRAD"}, {"axes"}, {"X@GRAD"}); -} - -} // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze); -PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad); -PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::SqueezeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::SqueezeGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc deleted file mode 100644 index 568097298b7acc86584b2de962e9ea06d73a26f5..0000000000000000000000000000000000000000 --- a/paddle/phi/ops/compat/unsqueeze_sig.cc +++ /dev/null @@ -1,47 +0,0 @@ - -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.InputSize("AxesTensorList") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensorList"; - return KernelSignature( - "unsqueeze_with_xshape", {"X"}, {"AxesTensorList"}, {"Out", "XShape"}); - } else if (ctx.InputSize("AxesTensor") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensor"; - return KernelSignature( - "unsqueeze_with_xshape", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); - } else { - VLOG(2) << "unsqueeze2 in axes"; - return KernelSignature( - "unsqueeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); - } -} - -KernelSignature UnsqueezeGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "unsqueeze_grad", {"XShape", "Out@GRAD"}, {}, {"X@GRAD"}); -} -} // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze); -PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad); - -PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::UnsqueezeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad, - phi::UnsqueezeGradOpArgumentMapping); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8afa300b57ac5807bee4bfb4527ebc339b2f78a8..a234e4906ff83fa3e16a4e804092717ffafb6b91 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1843,6 +1843,11 @@ function precise_card_test_single { for case in $(echo $testcases | tr "$|^" "\n" | awk '!/^$/') do cd ${PADDLE_ROOT}/build + + find paddle/fluid -name *.gcda | xargs rm -f + find paddle/phi -name *.gcda | xargs rm -f + find paddle/utils -name *.gcda | xargs rm -f + precise_card_test "^${case}$" $num #if test failed,continue,if test succeed ,go on @@ -1876,9 +1881,6 @@ function precise_card_test_single { fi mv python-coverage.data.* ${PADDLE_ROOT}/build/pytest/$case fi - find paddle/fluid -name *.gcda | xargs rm -f - find paddle/phi -name *.gcda | xargs rm -f - find 
paddle/utils -name *.gcda | xargs rm -f done } @@ -1988,6 +1990,10 @@ set +x fi read testcase <<< $(echo "$line"|grep -oEi "\w+$") + if [[ "$testcase" == "simple_precision_test" ]]; then + continue + fi + if [[ "$is_multicard" == "" ]]; then # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_") @@ -2032,6 +2038,8 @@ set -x mkdir -p ${PADDLE_ROOT}/build/ut_map mkdir -p ${PADDLE_ROOT}/build/pytest #run all unittest to get the coverage information of .c and .h files + precise_card_test_single "^simple_precision_test$" 1 + wait; precise_card_test_single "$single_card_tests" 1 precise_card_test_single "$single_card_tests_1" 1 precise_card_test_single "$multiple_card_tests" 2 diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 3f41ebaa96d07aa10de6e38bf2c80791f2c4b24d..5c97fe90a2e1771dd8edb47d54df54d3eaa51e99 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -20,11 +20,11 @@ __all__ = [] import paddle from paddle.common_ops_import import LayerHelper -from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops from paddle.fluid.dygraph import base as imperative_base from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.optimizer import Momentum, Optimizer from paddle.framework import core +from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops from paddle.static import create_global_var @@ -76,9 +76,9 @@ class DGCMomentumOptimizer(Optimizer): self._dgc_clip_norm = None if grad_clip is not None: - if not isinstance(grad_clip, GradientClipByNorm): + if not isinstance(grad_clip, ClipGradByNorm): raise TypeError( - "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm" + "The 
type of grad_clip should be 'ClipGradByNorm', because DGCMomentumOptimizer only support ClipGradByNorm" ) assert isinstance(num_trainers, int), ( "The type of num_trainers should be 'int', but received %s" diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 75f0061b2ca20be4c7f4f7dc10bf3c48a8374366..9eca2e667a8fd8c81aa3a4b1083ada9204cbecb6 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -15,9 +15,8 @@ import paddle from paddle import framework from paddle.autograd import no_grad -from paddle.fluid import layers -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.framework import core +from paddle.nn import ClipGradByGlobalNorm, clip from ...base.topology import ParallelMode from ...utils.hybrid_parallel_util import ( @@ -62,8 +61,8 @@ class HybridParallelClipGrad: continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + merge_grad = clip.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows(merge_grad) square = paddle.square(merge_grad) sum_square = paddle.sum(square) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py index b1a572d4edfc30d9fdccc45b1b056ef7411cf44d..9a25d7c4912bacc49c727c09958c1daaaf5c7c0c 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -30,7 +30,7 @@ import paddle import paddle.distributed 
as dist from paddle.distributed import ParallelMode, fleet from paddle.fluid import core -from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.nn import ClipGradByGlobalNorm from paddle.optimizer import Optimizer HybridParallelClipGrad = ( diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 3d3debb252d400ddf3962f064682cf1b829af131..d99683d481450309d95d13dfb26b0bc3471ea5e3 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -25,8 +25,8 @@ import paddle.fluid.framework as framework from paddle import nn from paddle.autograd import PyLayer from paddle.distributed import collective -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.framework import EagerParamBase +from paddle.nn import ClipGradByGlobalNorm from .group_sharded_storage import GradStorage from .group_sharded_utils import GroupShardedClipGrad, Type, device_guard diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 620540fea58761f8930b33bd8d65f6bafc7ff369..f8c86e02b7b52490dde4ad3c69068b9709c39250 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -23,6 +23,7 @@ from paddle import _legacy_C_ops from paddle.fluid import core, layers from paddle.fluid.dygraph import to_variable from paddle.fluid.framework import dygraph_only +from paddle.nn import clip class Taskflow: @@ -65,8 +66,8 @@ class GroupShardedClipGrad: merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.get_tensor_from_selected_rows( - layers.merge_selected_rows(g) + merge_grad = 
clip.get_tensor_from_selected_rows( + clip.merge_selected_rows(g) ) square = paddle.square(merge_grad) sum_square = paddle.sum(square) diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py index 8c15e47307381d862b15518cf860e34d4f9c4280..39284fa9f5a3f151747547b42409385d470571cd 100644 --- a/python/paddle/distributed/fleet/metrics/metric.py +++ b/python/paddle/distributed/fleet/metrics/metric.py @@ -159,7 +159,7 @@ def auc(stat_pos, stat_neg, scope=None, util=None): .. code-block:: python # in model.py - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(output, min=-15.0, max=15.0)) + similarity_norm = fluid.layers.sigmoid(paddle.clip(output, min=-15.0, max=15.0)) binary_predict = fluid.layers.concat( input=[paddle.subtract(fluid.layers.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1) self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] = diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 53d50a8b4a3ed378aa203f9458a4dc440e080716..eaf64e6dc6c0bd3c4bd0f3642f32953e52a81ea3 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -90,7 +90,6 @@ from .transpiler import ( DistributeTranspilerConfig, ) from .lod_tensor import create_lod_tensor, create_random_int_lodtensor -from . import clip from . import profiler from . import unique_name from . import parallel_executor @@ -99,7 +98,6 @@ from . import compiler from .compiler import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable from . 
import install_check -from .dygraph.nn import * from .dygraph.layers import * from .dygraph.base import enable_dygraph, disable_dygraph from .io import save, load, load_program_state, set_program_state @@ -165,7 +163,6 @@ __all__ = ( 'ParamAttr', 'WeightNormParamAttr', 'DataFeeder', - 'clip', 'profiler', 'unique_name', 'Scope', diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py deleted file mode 100644 index ffaa84ed3e53c5aadbb6dc3e8d51a48bc00a9fb6..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/clip.py +++ /dev/null @@ -1,944 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import warnings - -import functools -import paddle -from . import layers -from . import framework -from . import core -from . 
import name_scope -from .dygraph import base as imperative_base -from .data_feeder import check_variable_and_dtype -from .framework import in_dygraph_mode -from .layer_helper import LayerHelper -from .framework import default_main_program -from paddle import _C_ops, _legacy_C_ops - -__all__ = [ - 'set_gradient_clip', - 'ErrorClipByValue', - 'ClipGradByValue', - 'ClipGradByNorm', - 'ClipGradByGlobalNorm', -] - -_clip_by_global_norm_using_mp_type_flag = False - - -def _clip_by_global_norm_using_mp_type(*args): - global _clip_by_global_norm_using_mp_type_flag - assert len(args) <= 1 - if len(args) == 1: - assert isinstance(args[0], bool) - old_value = _clip_by_global_norm_using_mp_type_flag - _clip_by_global_norm_using_mp_type_flag = args[0] - return old_value - else: - return _clip_by_global_norm_using_mp_type_flag - - -def _cast_to_mp_type_if_enabled(x): - if ( - x.dtype == core.VarDesc.VarType.FP16 - or x.dtype == core.VarDesc.VarType.BF16 - ) and _clip_by_global_norm_using_mp_type(): - return x.astype(core.VarDesc.VarType.FP32) - else: - return x - - -def _squared_l2_norm(x): - r""" - This OP returns the squared L2 norm of a tensor. 
- """ - - x = _cast_to_mp_type_if_enabled(x) - if ( - core.is_compiled_with_xpu() - or x.dtype == core.VarDesc.VarType.FP16 - or x.dtype == core.VarDesc.VarType.BF16 - ): - square = paddle.square(x) - sum_square = paddle.sum(square) - return sum_square - - if in_dygraph_mode(): - return _C_ops.squared_l2_norm(x) - else: - op_type = 'squared_l2_norm' - check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type) - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(x.dtype) - - inputs = {"X": x} - outputs = {'Out': out} - helper.append_op(type=op_type, inputs=inputs, outputs=outputs) - return out - - -class BaseErrorClipAttr: - def __str__(self): - raise NotImplementedError() - - def _append_clip_op(self, block, grad_name): - raise NotImplementedError() - - -class ErrorClipByValue(BaseErrorClipAttr): - r""" - Clips tensor values to the range [min, max]. - - Given a tensor ``t`` (see Examples below), this operation clips its value \ - to ``min`` and ``max`` inplace. - - - Any values less than min are set to min. - - Any values greater than max are set to max. - - Args: - max (float): The maximum value to clip by. - min (float, optional): The minimum value to clip by. if not set by user, \ - will be set to ``-max`` by framework. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - BATCH_SIZE = 128 - CLIP_MAX = 2e-6 - CLIP_MIN = -1e-6 - prog = fluid.framework.Program() - with fluid.program_guard(main_program=prog): - image = fluid.layers.data( - name='x', shape=[784], dtype='float32') - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc( - input=hidden2, size=10, act='softmax') - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - cost = paddle.nn.functional.cross_entropy(input=predict, label=label, reduction='none', use_softmax=False) - avg_cost = paddle.mean(cost) - prog_clip = prog.clone() - prog_clip.block(0).var(hidden1.name)._set_error_clip( - fluid.clip.ErrorClipByValue( - max=CLIP_MAX, min=CLIP_MIN - ) - ) - """ - - def __init__(self, max, min=None): - max = float(max) - if min is None: - min = -max - else: - min = float(min) - self.max = max - self.min = min - - def __str__(self): - return "ByValue, min=%f, max=%f" % (self.min, self.max) - - def _append_clip_op(self, block, grad_name): - clip_op_desc = block.desc.append_op() - clip_op_desc.set_type("clip") - clip_op_desc.set_input("X", [grad_name]) - clip_op_desc.set_output("Out", [grad_name]) - clip_op_desc._set_attr("min", self.min) - clip_op_desc._set_attr("max", self.max) - - -def error_clip_callback(block, context): - # the context is a grad_to_var map - grad_to_var = context - op_desc = block.desc.op(block.desc.op_size() - 1) - for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]: - fwd_var = block._var_recursive(grad_to_var[grad_n]) - error_clip = getattr(fwd_var, "error_clip", None) - if not ( - error_clip is None or isinstance(error_clip, BaseErrorClipAttr) - ): - raise TypeError( - "Variable's error_clip should be an instance of BaseErrorClipAttr or None." 
- ) - if error_clip is not None: - error_clip._append_clip_op(block, grad_n) - - -class ClipGradBase: - def __init__(self): - super().__init__() - - def __str__(self): - raise NotImplementedError() - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - raise NotImplementedError - - def _static_clip(self, params_grads): - raise NotImplementedError - - def __call__(self, params_grads): - if in_dygraph_mode(): - return self._dygraph_clip(params_grads) - else: - for p, g in params_grads: - if getattr(p, 'gradient_clip_attr', None) is not None: - warnings.warn( - "'set_gradient_clip' will be ineffective, because you have " - "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' " - "is redundant and you can remove it." - ) - break - return self._static_clip(params_grads) - - def _process_context(self, context, param, grad): - raise NotImplementedError() - - def _create_operators(self, param, grad): - raise NotImplementedError() - - -class ClipGradByValue(ClipGradBase): - """ - Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. - - - Any values less than min are set to ``min``. - - - Any values greater than max are set to ``max``. - - The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - Note: - ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - max (float): The maximum value to clip by. - min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` - automatically. In this case, ``max`` must be greater than 0. - - Examples: - .. 
code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByValue(min=-1, max=1) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__(self, max, min=None): - super().__init__() - if min is None: - assert max > 0.0 - min = -max - self.max = float(max) - self.min = float(min) - - def __str__(self): - return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max) - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - new_grad = paddle.clip(x=g, min=self.min, max=self.max) - params_and_grads.append((p, new_grad)) - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - param_new_grad_name_dict = dict() - with framework.name_scope('gradient_clip'): - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_grad = layers.clip(x=g, min=self.min, max=self.max) - params_and_grads.append((p, new_grad)) - param_new_grad_name_dict[p.name] = new_grad.name - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - pass - - def _create_operators(self, param, grad): - new_grad = layers.clip(x=grad, min=self.min, max=self.max) - return param, new_grad - - -class ClipGradByNorm(ClipGradBase): - r""" - Limit the l2 norm of multi-dimensional Tensor 
:math:`X` to ``clip_norm`` . - - - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio. - - - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. - - The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - The clipping formula is: - - .. math:: - Out = - \left\{ - \begin{array}{ccl} - X & & if (norm(X) \leq clip\_norm) \\ - \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\ - \end{array} - \right. - - - where :math:`norm(X)` represents the L2 norm of :math:`X`. - - .. math:: - norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} - - Note: - ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - clip_norm(float): The maximum norm value. - - Examples: - .. 
code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__(self, clip_norm): - super().__init__() - self.clip_norm = float(clip_norm) - - def __str__(self): - return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) - params_and_grads.append((p, new_grad)) - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - with framework.name_scope('gradient_clip'): - param_new_grad_name_dict = dict() - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) - param_new_grad_name_dict[p.name] = new_grad.name - params_and_grads.append((p, new_grad)) - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - pass - - def _create_operators(self, param, grad): - new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm) - return param, new_grad - - -_allow_pure_fp16_global_norm_clip_flag = False - - -def _allow_pure_fp16_global_norm_clip(*args): - global _allow_pure_fp16_global_norm_clip_flag - if len(args) 
== 0: - return _allow_pure_fp16_global_norm_clip_flag - else: - assert len(args) == 1 and isinstance(args[0], bool) - old_value = _allow_pure_fp16_global_norm_clip_flag - _allow_pure_fp16_global_norm_clip_flag = args[0] - return old_value - - -class ClipGradByGlobalNorm(ClipGradBase): - r""" - Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in - :math:`t\_list` , and limit it to ``clip_norm`` . - - - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio. - - - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. - - The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - The clipping formula is: - - .. math:: - - t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)} - - where: - - .. math:: - - global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} - - Note: - ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - clip_norm (float): The maximum norm value. - group_name (str, optional): The group name for this clip. Default value is ``default_group``. - - Examples: - .. 
code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__( - self, clip_norm, group_name="default_group", auto_skip_clip=False - ): - super().__init__() - self.clip_norm = float(clip_norm) - self.group_name = group_name - assert isinstance(auto_skip_clip, bool) - self.auto_skip_clip = auto_skip_clip - - def __str__(self): - return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_list = [] - sum_square_list_fp16 = [] - sum_square_list_fp32 = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - continue - merge_grad = g - - if in_dygraph_mode() and g.is_selected_rows(): - merge_grad = layers.merge_selected_rows(g) - merge_grad = merge_grad._get_tensor_from_selected_rows() - - elif g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - - sum_square = _squared_l2_norm(merge_grad) - if ( - sum_square.dtype == core.VarDesc.VarType.FP16 - or sum_square.dtype == core.VarDesc.VarType.BF16 - ): - sum_square_list_fp16.append(sum_square) - elif sum_square.dtype == core.VarDesc.VarType.FP32: - sum_square_list_fp32.append(sum_square) - else: - sum_square_list.append(sum_square) - - # all parameters have been filterd out - if ( - len(sum_square_list) - + len(sum_square_list_fp16) - + len(sum_square_list_fp32) - == 0 - ): - return params_grads - - sum_dtype = 
'float64' if len(sum_square_list) > 0 else "float32" - global_norm_var = [] - if len(sum_square_list_fp16) > 0: - global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16) - global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) - if len(sum_square_list_fp32) > 0: - global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32) - if sum_dtype == 'float32': - global_norm_var.append(global_norm_var_fp32) - else: - global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) - if len(sum_square_list) > 0: - global_norm_var_fp64 = paddle.add_n(sum_square_list) - global_norm_var.append(global_norm_var_fp64) - global_norm_var = paddle.add_n(global_norm_var) - global_norm_var = paddle.sqrt(global_norm_var) - max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm - ) - - need_clip = False - if not self.auto_skip_clip: # always apply clip - need_clip = True - clip_var = paddle.divide( - x=max_global_norm, - y=paddle.maximum(x=global_norm_var, y=max_global_norm), - ) - elif global_norm_var > max_global_norm: - # only when global_norm_var > max_global_norm, grad need clip - need_clip = True - clip_var = paddle.divide(x=max_global_norm, y=global_norm_var) - - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - # TODO(wangxi): use inplace elementwise_mul - if need_clip: - clip_input = ( - clip_var.astype(g.dtype) - if clip_var.dtype != g.dtype - else clip_var - ) - new_grad = paddle.multiply(g, clip_input) - params_and_grads.append((p, new_grad)) - else: - params_and_grads.append((p, g)) - - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - sum_square_list = [] - sum_square_list_fp16 = [] - sum_square_list_fp32 = [] - with framework.name_scope('gradient_clip'): - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - continue - merge_grad = g - with 
p.block.program._optimized_guard([p, g]): - if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows( - merge_grad - ) - sum_square = _squared_l2_norm(merge_grad) - if sum_square.dtype == core.VarDesc.VarType.FP16: - sum_square_list_fp16.append(sum_square) - elif sum_square.dtype == core.VarDesc.VarType.FP32: - sum_square_list_fp32.append(sum_square) - else: - sum_square_list.append(sum_square) - - # all parameters have been filterd out - if ( - len(sum_square_list) - + len(sum_square_list_fp16) - + len(sum_square_list_fp32) - == 0 - ): - return params_grads - - with p.block.program._optimized_guard([p, g]): - sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" - - global_norm_var = [] - if len(sum_square_list_fp16) > 0: - global_norm_var_fp16 = layers.sums(sum_square_list_fp16) - if ( - sum_square_list_fp32 - or sum_square_list - or not _allow_pure_fp16_global_norm_clip() - ): - global_norm_var.append( - global_norm_var_fp16.astype(sum_dtype) - ) - else: - global_norm_var.append(global_norm_var_fp16) - if len(sum_square_list_fp32) > 0: - global_norm_var_fp32 = layers.sums(sum_square_list_fp32) - if sum_dtype == 'float32': - global_norm_var.append(global_norm_var_fp32) - else: - global_norm_var.append( - global_norm_var_fp32.astype(sum_dtype) - ) - if len(sum_square_list) > 0: - # fp64 - global_norm_var_other_dtype = layers.sums(sum_square_list) - global_norm_var.append(global_norm_var_other_dtype) - - global_norm_var = ( - layers.sums(global_norm_var) - if len(global_norm_var) > 1 - else global_norm_var[0] - ) - global_norm_var = paddle.sqrt(x=global_norm_var) - max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm - ) - scale_var = paddle.divide( - x=max_global_norm, - y=paddle.maximum(x=max_global_norm, y=global_norm_var), - ) - param_new_grad_name_dict = dict() - for p, g in params_grads: - if g is None: - 
continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_g = _cast_to_mp_type_if_enabled(g) - # inplace - scale_input = ( - scale_var.astype('float16') - if new_g.dtype == core.VarDesc.VarType.FP16 - and scale_var.dtype != core.VarDesc.VarType.FP16 - else scale_var - ) - # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g - # will be in different blocks with the gradient clip related ops. - # We need to handle the correct block, otherwise will encounter - # a 'NotFoundError' during compile time. - block = default_main_program().current_block() - block.append_op( - type='elementwise_mul', - inputs={'X': new_g, 'Y': scale_input}, - outputs={'Out': new_g}, - ) - if new_g is not g: - block.append_op( - type='cast', - inputs={'X': new_g}, - outputs={'Out': g}, - attrs={ - 'in_dtype': new_g.dtype, - 'out_dtype': g.dtype, - }, - ) - - param_new_grad_name_dict[p.name] = g.name - params_and_grads.append((p, g)) - - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - if self.group_name not in context: - context[self.group_name] = [] - context[self.group_name + "_clip_value"] = self.clip_norm - context[self.group_name + "_clip"] = layers.fill_constant( - shape=[1], dtype=grad.dtype, value=self.clip_norm - ) - else: - if not self.clip_norm == context[self.group_name + "_clip_value"]: - raise ValueError( - "All parameters' 'clip_norm' of a same group should be the same" - ) - - merge_grad = grad - if grad.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(grad) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - - local_norm_var = _squared_l2_norm(merge_grad) - context[self.group_name].append(local_norm_var) - - self.context = context - - def _create_operators(self, param, grad): - group_scale_name = self.group_name + 
"_scale" - if group_scale_name not in self.context: - group_norm_var = layers.sums(input=self.context[self.group_name]) - group_norm_var = paddle.sqrt(x=group_norm_var) - clip_var = self.context[self.group_name + "_clip"] - group_scale_var = paddle.divide( - x=clip_var, - y=paddle.maximum(x=clip_var, y=group_norm_var), - ) - assert group_scale_var.shape == (1,) - self.context[group_scale_name] = group_scale_var - - # inplace - param.block.append_op( - type='elementwise_mul', - inputs={'X': grad, 'Y': self.context[group_scale_name]}, - outputs={'Out': grad}, - ) - - return param, grad - - -@framework.dygraph_not_support -def set_gradient_clip(clip, param_list=None, program=None): - """ - :api_attr: Static Graph - - Warning: - - This API must be used after building network, and before ``minimize`` , - and it may be removed in future releases, so it is not recommended. - It is recommended to set ``grad_clip`` when initializing the ``optimizer`` , - this is a better method to clip gradient. There are three clipping strategies: - :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` . - - To specify parameters that require gradient clip. - - Args: - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no - gradient clipping. - param_list (list(Variable), optional): Parameters that require gradient clip. - It can be a list of parameter or a list of parameter's name. - Default None, meaning that all parameters in the program will be included. - program (Program, optional): The program where parameters are located. - Default None, meaning that using :ref:`api_fluid_default_main_program` . 
- - Returns: - None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - def network(): - image = fluid.data(name='image', shape=[ - None, 28], dtype='float32') - param_attr1 = fluid.ParamAttr("fc1_param") - fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1) - param_attr2 = fluid.ParamAttr("fc2_param") - fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2) - loss = fluid.layers.reduce_mean(fc2) - return loss - - - # network 1: clip all parameter gradient - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0)) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 2: clip parameter gradient by name - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByValue(min=-1.0, max=1.0), - param_list=["fc1_param", "fc2_param"]) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 3: clip parameter gradient by value - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - param_var1 = fluid.default_main_program().global_block().var("fc1_param") - param_var2 = fluid.default_main_program().global_block().var("fc2_param") - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByValue(min=-1.0, max=1.0), - param_list=[param_var1, param_var2]) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0) - clip2 = fluid.clip.GradientClipByNorm(clip_norm=1.0) - # Set the gradient clipping strategy: clip1 - fluid.clip.set_gradient_clip(clip1) - # Set the gradient clipping strategy: clip2 - sgd = fluid.optimizer.SGD(learning_rate=1e-3, 
grad_clip=clip2) - sgd.minimize(loss) - # 'set_gradient_clip' will not take effect when setting has a conflict, - # and the gradient clipping strategy will be 'clip2' - - - """ - warnings.warn( - "Caution! 'set_gradient_clip' is not recommended " - "and may be deprecated in future! " - "We recommend a new strategy: set 'grad_clip' " - "when initializing the 'optimizer'. " - "This method can reduce the mistakes, please " - "refer to documention of 'optimizer'." - ) - - if not isinstance(clip, ClipGradBase): - raise TypeError( - "'clip' should be an instance of ClipGradBase's derived class" - ) - if program is None: - program = framework.default_main_program() - - for op in program.block(0).ops: - if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr( - "op_namescope" - ): - warnings.warn( - "'minimize' has been invoked before, this will make 'set_gradient_clip' " - "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'." - ) - break - - if param_list is None: - param_list = program.block(0).all_parameters() - if all(isinstance(elem, str) for elem in param_list): - param_list = [program.block(0).var(elem) for elem in param_list] - if not all(isinstance(elem, framework.Parameter) for elem in param_list): - raise TypeError( - "'param_list' should be a list of Parameter or basestring(parameter's name)." 
- ) - - for param in param_list: - param.gradient_clip_attr = copy.deepcopy(clip) - - -def append_gradient_clip_ops(param_grads): - context = dict() - for p, g in param_grads: - if g is None: - continue - with p.block.program._optimized_guard([p, g]), framework.name_scope( - 'gradient_clip' - ): - clip_attr = getattr(p, 'gradient_clip_attr', None) - if clip_attr is None: - return param_grads - if not isinstance(clip_attr, ClipGradBase): - raise TypeError( - "clip attribute should be an instance of GradientClipBase" - ) - - clip_attr._process_context(context=context, param=p, grad=g) - - res = [] - param_new_grad_name_dict = dict() - for p, g in param_grads: - if g is None: - continue - with p.block.program._optimized_guard([p, g]), framework.name_scope( - 'gradient_clip' - ): - param, new_grad = clip_attr._create_operators(param=p, grad=g) - param_new_grad_name_dict[param.name] = new_grad.name - res.append([param, new_grad]) - - _correct_clip_op_role_var(res, param_new_grad_name_dict) - return res - - -# change wrong mapping relation between param & grad in clip op -# Note: This function is sensitive to the time cost of the network with gradient clipping -# and should not be changed easily. If you must change, please test the time cost. 
-def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): - block_id_list = [] - if len(param_new_grad_name_dict) == 0: - return - for param, grad in params_grads: - if grad is None: - continue - block_id = param.block.idx - if block_id in block_id_list: - continue - block_id_list.append(block_id) - for op in param.block.program.global_block().ops: - if ( - op.has_attr("op_namescope") - and "gradient_clip" in op.attr("op_namescope") - and op.attr('op_role_var') - ): - param_name = op.attr('op_role_var')[0] - if param_name in param_new_grad_name_dict: - correct_p_g = [ - param_name, - param_new_grad_name_dict[param_name], - ] - op._set_attr('op_role_var', correct_p_g) - - -GradientClipBase = ClipGradBase -GradientClipByValue = ClipGradByValue -GradientClipByNorm = ClipGradByNorm -GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index aebcc09eaa14ba8242b7ea4041f5816a5259cc7c..b98c188ae4f6ab3ecd191940431bb86d84ddccc7 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -21,9 +21,6 @@ from .layers import * from . import container from .container import * -from . import nn -from .nn import * - from . import tracer from .tracer import * @@ -45,7 +42,6 @@ __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ __all__ += container.__all__ -__all__ += nn.__all__ __all__ += parallel.__all__ __all__ += checkpoint.__all__ __all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py deleted file mode 100644 index f6009912bee9062a4a8478237a41c9168af50782..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/dygraph/nn.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from .. import core -from ..layers import utils -from ..layers import nn as F -from .. import dygraph_utils -from . import layers -from ..framework import ( - Variable, - OpProtoHolder, - Parameter, - _dygraph_tracer, - _varbase_creator, - default_main_program, - _global_flags, - in_dygraph_mode, -) - -from ..data_feeder import ( - convert_dtype, - check_variable_and_dtype, - check_type, - check_dtype, -) - -from ..param_attr import ParamAttr -from ..initializer import Normal, Constant, NumpyArrayInitializer -from .. import unique_name -from .layer_object_helper import LayerObjectHelper -from ..data_feeder import check_variable_and_dtype, check_type -import numpy as np -import numbers -import logging -import os -import paddle.utils.deprecated as deprecated -from paddle import _C_ops, _legacy_C_ops - -__all__ = [] - - -class BatchNorm(layers.Layer): - r""" - - This interface is used to construct a callable object of the ``BatchNorm`` class. - For more details, refer to code examples. - It implements the function of the Batch Normalization Layer and can be used - as a normalizer function for conv2d and fully connected operations. - The data is normalized by the mean and variance of the channel based on the current batch data. - Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing - Internal Covariate Shift `_ - for more details. 
- - When use_global_stats = False, the :math:`\mu_{\beta}` - and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. - Calculated as follows: - - .. math:: - - \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad & - //\ mini-batch\ mean \\ - \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad & - //\ mini-batch\ variance \\ - - - :math:`x` : mini-batch data - - :math:`m` : the size of the mini-batch data - - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. - They are global or running statistics (moving_mean and moving_variance). It usually got from the - pre-trained model. Calculated as follows: - - .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ - - The normalization function formula is as follows: - - .. math:: - - \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ - \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ - y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - - - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\gamma` : trainable proportional parameter - - :math:`\beta` : trainable deviation parameter - - Parameters: - num_channels(int): Indicate the number of channels of the input ``Tensor``. - act(str, optional): Activation to be applied to the output of batch normalization. Default: None. - is_test (bool, optional): A flag indicating whether it is in test phrase or not. - This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``. - Default: False. - momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - epsilon(float, optional): The small value added to the variance to prevent division by zero. 
Default: 1e-5. - param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - dtype(str, optional): Indicate the data type of the input ``Tensor``, - which can be float32 or float64. Default: float32. - data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC", where `N` is batch size, `C` is the number of the feature map, `H` is the height of the feature map, `W` is the width of the feature map. Default: NCHW. - in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False. - moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None. - moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None. - do_model_average_for_mean_and_var(bool, optional): Whether parameter mean and variance should do model - average when model average is enabled. Default: True. - use_global_stats(bool, optional): Whether to use global mean and - variance. In inference or test mode, set use_global_stats to true - or is_test to true, and the behavior is equivalent. - In train mode, when setting use_global_stats True, the global mean - and variance are also used during train period. Default: False. - trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. 
In eval mode, when - setting trainable_statistics True, mean and variance will be calculated by current batch statistics. - Default: False. - - Returns: - None - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable - - x = paddle.rand([3, 10, 3, 7], 'float32') - with fluid.dygraph.guard(): - x = to_variable(x) - batch_norm = fluid.BatchNorm(10) - hidden1 = batch_norm(x) - """ - - def __init__( - self, - num_channels, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - dtype='float32', - data_layout='NCHW', - in_place=False, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, - trainable_statistics=False, - ): - super().__init__() - self._param_attr = param_attr - self._bias_attr = bias_attr - self._act = act - self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] - - assert ( - bias_attr is not False - ), "bias_attr should not be False in batch_norm." 
- - if dtype == "float16": - self._dtype = "float32" - else: - self._dtype = dtype - - param_shape = [num_channels] - - # create parameter - self.weight = self.create_parameter( - attr=self._param_attr, - shape=param_shape, - dtype=self._dtype, - default_initializer=Constant(1.0), - ) - self.weight.stop_gradient = ( - use_global_stats and self._param_attr.learning_rate == 0.0 - ) - - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=True, - ) - self.bias.stop_gradient = ( - use_global_stats and self._param_attr.learning_rate == 0.0 - ) - - self._mean = self.create_parameter( - attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=self._dtype, - ) - self._mean.stop_gradient = True - - self._variance = self.create_parameter( - attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=self._dtype, - ) - self._variance.stop_gradient = True - - self._in_place = in_place - self._data_layout = data_layout - self._momentum = momentum - self._epsilon = epsilon - self._is_test = is_test - self._fuse_with_relu = False - self._use_global_stats = use_global_stats - self._trainable_statistics = trainable_statistics - - def forward(self, input): - # create output - # mean and mean_out share the same memory - mean_out = self._mean - # variance and variance out share the same memory - variance_out = self._variance - - if in_dygraph_mode(): - batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm( - input, - self._mean, - self._variance, - self.weight, - self.bias, - not self.training, - self._momentum, - self._epsilon, - self._data_layout, - self._use_global_stats, - self._trainable_statistics, - ) - return dygraph_utils._append_activation_in_dygraph( - batch_norm_out, 
act=self._act, use_mkldnn=self._use_mkldnn - ) - else: - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm' - ) - - attrs = { - "momentum": self._momentum, - "epsilon": self._epsilon, - "is_test": self._is_test, - "data_layout": self._data_layout, - "use_mkldnn": False, - "fuse_with_relu": self._fuse_with_relu, - "use_global_stats": self._use_global_stats, - "trainable_statistics": self._trainable_statistics, - } - - inputs = { - "X": [input], - "Scale": [self.weight], - "Bias": [self.bias], - "Mean": [self._mean], - "Variance": [self._variance], - } - - saved_mean = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True - ) - saved_variance = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True - ) - reserve_space = self._helper.create_variable_for_type_inference( - dtype=self._helper.input_dtype(input), stop_gradient=True - ) - - batch_norm_out = ( - input - if self._in_place - else self._helper.create_variable_for_type_inference( - self._dtype - ) - ) - - outputs = { - "Y": [batch_norm_out], - "MeanOut": [mean_out], - "VarianceOut": [variance_out], - "SavedMean": [saved_mean], - "SavedVariance": [saved_variance], - } - if reserve_space is not None: - outputs["ReserveSpace"] = [reserve_space] - - self._helper.append_op( - type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs - ) - - # Currently, we don't support inplace in dygraph mode - return self._helper.append_activation(batch_norm_out, self._act) diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index df198931199f59520368faee025a77b42b5bdcd7..4ec3c1d16e077ea00672c664bac3b1b4ea5e491c 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -185,7 +185,7 @@ class FleetUtil: # below is part of model emb = my_slot_net(slots, label) # emb can be fc 
layer of size 1 - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ @@ -1374,7 +1374,7 @@ class FleetUtil: label = fluid.layers.data(name="click", shape=[-1, 1],\ dtype="int64", lod_level=0, append_batch_size=False) emb = my_slot_net(slots, label) # emb can be fc layer of size 1 - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ @@ -1574,7 +1574,7 @@ class FleetUtil: label = fluid.layers.data(name="click", shape=[-1, 1],\ dtype="int64", lod_level=0, append_batch_size=False) emb = my_slot_net(slots, label) # emb can be fc layer of size 1 - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 5756361f89e46f005072d2136b2e13de4762525b..bf1ad9b107f74694c80472f583287d617fdf0616 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -25,7 +25,7 @@ from .param_attr import ParamAttr from .initializer import Constant from . import layers from . import backward -from .dygraph import Layer, nn +from .dygraph import Layer from . import executor from . import optimizer from . 
import core diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 264c8ce6da94e8af7fb1d3b27c429880983f8bf8..c11a541df5326794a72390086442664aee26a142 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -63,10 +63,6 @@ __all__ = [ 'fc', 'embedding', 'autoincreased_step_counter', - 'clip', - 'clip_by_norm', - 'merge_selected_rows', - 'get_tensor_from_selected_rows', ] OP_NAMEMAPPING = { @@ -997,199 +993,3 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): ) return out - - -@templatedoc() -def clip(x, min, max, name=None): - """ - :old_api: paddle.fluid.layers.clip - - ${comment} - - Args: - x(${x_type}): ${x_comment} - min(float): ${min_comment} - max(float): ${max_comment} - name(str, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` - - Returns: - ${out_comment} - - Return Type: - ${out_type} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - input = fluid.data( - name='data', shape=[1], dtype='float32') - reward = fluid.layers.clip(x=input, min=-1.0, max=1.0) - """ - - helper = LayerHelper("clip", **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip') - - if name is None: - name = unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp']) - ) - - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False - ) - - helper.append_op( - type="clip", - inputs={"X": x}, - attrs={"min": min, "max": max}, - outputs={"Out": out}, - ) - - return out - - -@templatedoc() -def clip_by_norm(x, max_norm, name=None): - """ - ${comment} - - Args: - x(${x_type}): ${x_comment} - max_norm(${max_norm_type}): ${max_norm_comment} - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. 
- - Returns: - Tensor: - - out(${out_type}): ${out_comment} - - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - - input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32') - reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0) - # [[0.5, 0.5], [0.5, 0.5]] - """ - - if in_dygraph_mode(): - return _C_ops.clip_by_norm(x, max_norm) - else: - helper = LayerHelper("clip_by_norm", **locals()) - check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm') - check_type(max_norm, 'max_norm', (float), 'clip_by_norm') - - if name is None: - name = unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp']) - ) - - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False - ) - - helper.append_op( - type="clip_by_norm", - inputs={"X": x}, - attrs={"max_norm": max_norm}, - outputs={"Out": out}, - ) - - return out - - -@templatedoc() -def merge_selected_rows(x, name=None): - """ - ${comment} - - Args: - x(${x_type}): ${x_comment} - name(basestring|None): Name of the output. - - Returns: - out(${out_type}): ${out_comment} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - b = fluid.default_main_program().global_block() - var = b.create_var( - name="X", dtype="float32", persistable=True, - type=fluid.core.VarDesc.VarType.SELECTED_ROWS) - y = fluid.layers.merge_selected_rows(var) - """ - if in_dygraph_mode(): - return _C_ops.merge_selected_rows(x) - else: - helper = LayerHelper("merge_selected_rows", **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="merge_selected_rows", - inputs={"X": x}, - attrs={}, - outputs={"Out": out}, - ) - return out - - -@templatedoc() -def get_tensor_from_selected_rows(x, name=None): - """ - This operator gets tensor data from input with SelectedRows type, and outputs a LoDTensor. - - .. 
code-block:: text - - input x is SelectedRows: - x.rows = [0, 5, 5, 4, 19] - x.height = 20 - x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]] - - Output is LoDTensor: - out.shape = [5, 2] - out.data = [[1, 1], - [2, 2], - [2, 2], - [3, 3], - [6, 6]] - - Args: - x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64. - name(str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Variable: LoDTensor transformed from SelectedRows. The data type is same with input. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - b = fluid.default_main_program().global_block() - input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS) - out = fluid.layers.get_tensor_from_selected_rows(input) - """ - - check_type(x, 'x', Variable, 'get_tensor_from_selected_rows') - if x.type != core.VarDesc.VarType.SELECTED_ROWS: - raise TypeError( - "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS." 
- ) - helper = LayerHelper('get_tensor_from_selected_rows', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='get_tensor_from_selected_rows', - inputs={'X': x}, - outputs={'Out': out}, - attrs={}, - ) - return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 3e59ca2472ab4decea0dc6f93cbb5fa1492ec0c4..cbbe8dbadef12f6584371c7fcd500b2c3b0b7c5a 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -38,13 +38,6 @@ from .backward import ( _append_grad_suffix_, _get_no_grad_set_name, ) -from .clip import ( - GradientClipBase, - GradientClipByNorm, - error_clip_callback, - append_gradient_clip_ops, - ClipGradByGlobalNorm, -) from .framework import program_guard from .initializer import Constant from .layer_helper import LayerHelper @@ -160,7 +153,7 @@ class Optimizer: ) if grad_clip is not None: - if not isinstance(grad_clip, GradientClipBase): + if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase): raise TypeError( "'grad_clip' should be an instance of GradientClipBase's derived class" ) @@ -1030,7 +1023,7 @@ class Optimizer: params_grads.append((param, grad_var)) else: if callbacks is None: - callbacks = [error_clip_callback] + callbacks = [paddle.nn.clip.error_clip_callback] else: assert isinstance(callbacks, list) program = loss.block.program @@ -1260,7 +1253,7 @@ class Optimizer: # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization. 
if self._flatten_param_grads and self.regularization is None: if self._grad_clip is None or isinstance( - self._grad_clip, ClipGradByGlobalNorm + self._grad_clip, paddle.nn.ClipGradByGlobalNorm ): params_grads = self.flatten_param_grads(params_grads) @@ -1268,7 +1261,7 @@ class Optimizer: if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) else: - params_grads = append_gradient_clip_ops(params_grads) + params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads) # Add regularization if any params_grads = self.append_regularization_ops( diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py index a2be8260e39b69fe19b8f5932fb00e498459d30b..7b251e8063a05e7d4a09238feaf1efef04739fe4 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py @@ -150,26 +150,29 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) out = func(t) if use_func else paddle.nn.functional.relu(t) - out.stop_gradient = False - dx = paddle.grad( - outputs=[out], inputs=[t], create_graph=True, retain_graph=True + outputs=out, + inputs=t, + grad_outputs=paddle.ones_like(t), + create_graph=True, + retain_graph=True, ) - dx[0].backward() - - assert dx[0].grad is not None - return dx[0].numpy(), dx[0].grad.numpy() + ddout = paddle.grad( + outputs=dx[0], + inputs=out.grad, + grad_outputs=paddle.ones_like(t), + create_graph=False, + ) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + assert ddout[0].numpy() is not None + return dx[0].numpy(), ddout[0].numpy() -class TestNewCustomOpSetUpInstall(unittest.TestCase): + +class TestNewCustomOpXpuSetUpInstall(unittest.TestCase): def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) - # compile, install the custom op egg 
into site-packages under background - # Currently custom XPU op does not support Windows - if os.name == 'nt': - return cmd = 'cd {} && {} custom_relu_xpu_setup.py install'.format( cur_dir, sys.executable ) @@ -192,7 +195,7 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): self.custom_op = custom_relu_xpu_module_setup.custom_relu self.dtypes = ['float32', 'float64'] - self.devices = ['xpu'] + self.device = 'xpu' # config seed SEED = 2021 @@ -200,91 +203,90 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): paddle.framework.random._manual_program_seed(SEED) def test_static(self): - for device in self.devices: - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out = custom_relu_static(self.custom_op, device, dtype, x) - pd_out = custom_relu_static( - self.custom_op, device, dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_static( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) def test_static_pe(self): - for device in self.devices: - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out = custom_relu_static_pe(self.custom_op, device, dtype, x) - pd_out = custom_relu_static_pe( - self.custom_op, device, dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static_pe(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_static_pe( + self.custom_op, self.device, dtype, x, False + ) + 
np.testing.assert_allclose( + out, + pd_out, + atol=1e-2, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) def test_dynamic(self): - for device in self.devices: - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out, x_grad = custom_relu_dynamic( - self.custom_op, device, dtype, x - ) - pd_out, pd_x_grad = custom_relu_dynamic( - self.custom_op, device, dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - x_grad, - pd_x_grad, - err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( - x_grad, pd_x_grad - ), - ) + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, x_grad = custom_relu_dynamic( + self.custom_op, self.device, dtype, x + ) + pd_out, pd_x_grad = custom_relu_dynamic( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) + np.testing.assert_array_equal( + x_grad, + pd_x_grad, + err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( + x_grad, pd_x_grad + ), + ) def test_static_save_and_load_inference_model(self): paddle.enable_static() np_data = np.random.random((1, 1, 28, 28)).astype("float32") np_label = np.random.random((1, 1)).astype("int64") path_prefix = "self.custom_op_inference/custom_relu" - for device in self.devices: - predict = custom_relu_static_inference( - self.custom_op, device, np_data, np_label, path_prefix + + predict = custom_relu_static_inference( + self.custom_op, self.device, np_data, np_label, path_prefix + ) + # load inference model + with static.scope_guard(static.Scope()): + exe = static.Executor() + [ + inference_program, + feed_target_names, + fetch_targets, + ] = static.load_inference_model(path_prefix, exe) + predict_infer = exe.run( + 
inference_program, + feed={feed_target_names[0]: np_data}, + fetch_list=fetch_targets, + ) + np.testing.assert_allclose( + predict, + predict_infer, + atol=1e-2, + err_msg='custom op predict: {},\n custom op infer predict: {}'.format( + predict, predict_infer + ), ) - # load inference model - with static.scope_guard(static.Scope()): - exe = static.Executor() - [ - inference_program, - feed_target_names, - fetch_targets, - ] = static.load_inference_model(path_prefix, exe) - predict_infer = exe.run( - inference_program, - feed={feed_target_names[0]: np_data}, - fetch_list=fetch_targets, - ) - np.testing.assert_array_equal( - predict, - predict_infer, - err_msg='custom op predict: {},\n custom op infer predict: {}'.format( - predict, predict_infer - ), - ) paddle.disable_static() def test_static_save_and_run_inference_predictor(self): @@ -294,92 +296,97 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): path_prefix = "self.custom_op_inference/custom_relu" from paddle.inference import Config, create_predictor - for device in self.devices: - predict = custom_relu_static_inference( - self.custom_op, device, np_data, np_label, path_prefix - ) - # load inference model - config = Config( - path_prefix + ".pdmodel", path_prefix + ".pdiparams" + predict = custom_relu_static_inference( + self.custom_op, self.device, np_data, np_label, path_prefix + ) + # load inference model + config = Config(path_prefix + ".pdmodel", path_prefix + ".pdiparams") + predictor = create_predictor(config) + input_tensor = predictor.get_input_handle( + predictor.get_input_names()[0] + ) + input_tensor.reshape(np_data.shape) + input_tensor.copy_from_cpu(np_data.copy()) + predictor.run() + output_tensor = predictor.get_output_handle( + predictor.get_output_names()[0] + ) + predict_infer = output_tensor.copy_to_cpu() + predict = np.array(predict).flatten() + predict_infer = np.array(predict_infer).flatten() + np.testing.assert_allclose( + predict, + predict_infer, + rtol=5e-5, + atol=1e-2, + 
err_msg="custom op predict: {},\n custom op infer predict: {}".format( + predict, predict_infer + ), + ) + paddle.disable_static() + + def test_func_double_grad_dynamic(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, dx_grad = custom_relu_double_grad_dynamic( + self.custom_op, self.device, dtype, x ) - predictor = create_predictor(config) - input_tensor = predictor.get_input_handle( - predictor.get_input_names()[0] + pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( + self.custom_op, self.device, dtype, x, False ) - input_tensor.reshape(np_data.shape) - input_tensor.copy_from_cpu(np_data.copy()) - predictor.run() - output_tensor = predictor.get_output_handle( - predictor.get_output_names()[0] + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), ) - predict_infer = output_tensor.copy_to_cpu() - self.assertTrue( - np.isclose(predict, predict_infer, rtol=5e-5).any(), - "custom op predict: {},\n custom op infer predict: {}".format( - predict, predict_infer + np.testing.assert_array_equal( + dx_grad, + pd_dx_grad, + err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( + dx_grad, pd_dx_grad ), ) - paddle.disable_static() - - def test_func_double_grad_dynamic(self): - for device in self.devices: - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out, dx_grad = custom_relu_double_grad_dynamic( - self.custom_op, device, dtype, x - ) - pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( - self.custom_op, device, dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - dx_grad, - pd_dx_grad, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) def test_with_dataloader(self): paddle.disable_static() - for device in self.devices: - 
paddle.set_device(device) - # data loader - transform = Compose( - [Normalize(mean=[127.5], std=[127.5], data_format='CHW')] - ) - train_dataset = paddle.vision.datasets.MNIST( - mode='train', transform=transform - ) - train_loader = paddle.io.DataLoader( - train_dataset, - batch_size=64, - shuffle=True, - drop_last=True, - num_workers=0, - ) + paddle.set_device(self.device) + # data loader + transform = Compose( + [Normalize(mean=[127.5], std=[127.5], data_format='CHW')] + ) + train_dataset = paddle.vision.datasets.MNIST( + mode='train', transform=transform + ) + train_loader = paddle.io.DataLoader( + train_dataset, + batch_size=64, + shuffle=True, + drop_last=True, + num_workers=0, + ) - for batch_id, (image, _) in enumerate(train_loader()): - out = self.custom_op(image) - pd_out = paddle.nn.functional.relu(image) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + for batch_id, (image, _) in enumerate(train_loader()): + out = self.custom_op(image) + pd_out = paddle.nn.functional.relu(image) + np.testing.assert_allclose( + out, + pd_out, + atol=1e-2, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) - if batch_id == 5: - break + if batch_id == 5: + break paddle.enable_static() if __name__ == '__main__': + # compile, install the custom op egg into site-packages under background + # Currently custom XPU op does not support Windows + if os.name == 'nt': + exit() unittest.main() diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt index 1dd6ef6776750c01fa78b6e6a269fea0df63f33d..00eef2d5a77316dcb3918ff32dde55b4fe9a1c73 100644 --- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -28,4 +28,5 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120) 
set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120) set_tests_properties(test_custom_cpu_to_static PROPERTIES TIMEOUT 120) + set_tests_properties(test_custom_device_relu_setup PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..da0563ffeb10e3762dc874676ffc9402d0529bc7 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "paddle/extension.h" + +#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_CUSTOM_INPUT(x) \ + PD_CHECK(x.is_custom_device(), #x " must be a custom Tensor.") + +template +void relu_cpu_forward_kernel(const data_t* x_data, + data_t* out_data, + int64_t x_numel) { + PD_CHECK(x_data != nullptr, "x_data is nullptr."); + PD_CHECK(out_data != nullptr, "out_data is nullptr."); + for (int64_t i = 0; i < x_numel; ++i) { + out_data[i] = std::max(static_cast(0.), x_data[i]); + } +} + +template +void relu_cpu_backward_kernel(const data_t* grad_out_data, + const data_t* out_data, + data_t* grad_x_data, + int64_t out_numel) { + for (int64_t i = 0; i < out_numel; ++i) { + grad_x_data[i] = + grad_out_data[i] * (out_data[i] > static_cast(0) ? 1. : 0.); + } +} + +template +void relu_cpu_double_backward_kernel(const data_t* out_data, + const data_t* ddx_data, + data_t* ddout_data, + int64_t ddout_numel) { + for (int64_t i = 0; i < ddout_numel; ++i) { + ddout_data[i] = + ddx_data[i] * (out_data[i] > static_cast(0) ? 1. 
: 0.); + } +} + +std::vector relu_cpu_forward(const paddle::Tensor& x) { + CHECK_CPU_INPUT(x); + auto out = paddle::empty_like(x); + + PD_DISPATCH_FLOATING_TYPES( + x.type(), "relu_cpu_forward", ([&] { + relu_cpu_forward_kernel( + x.data(), out.data(), x.numel()); + })); + + return {out}; +} + +std::vector relu_cpu_backward(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out) { + auto grad_x = paddle::empty_like(x); + + PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { + relu_cpu_backward_kernel( + grad_out.data(), + out.data(), + grad_x.data(), + out.size()); + })); + + return {grad_x}; +} + +std::vector relu_cpu_double_backward( + const paddle::Tensor& out, const paddle::Tensor& ddx) { + CHECK_CPU_INPUT(out); + CHECK_CPU_INPUT(ddx); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); + + PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] { + relu_cpu_double_backward_kernel( + out.data(), + ddx.data(), + ddout.mutable_data(out.place()), + ddout.size()); + })); + + return {ddout}; +} + +std::vector relu_custom_forward(const paddle::Tensor& x) { + CHECK_CUSTOM_INPUT(x); + auto out = paddle::relu(x); + return {out}; +} + +std::vector relu_custom_backward( + const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out) { + CHECK_CUSTOM_INPUT(x); + CHECK_CUSTOM_INPUT(out); + auto grad_x = paddle::empty_like(x, x.dtype(), x.place()); + auto ones = paddle::experimental::full_like(x, 1.0, x.dtype(), x.place()); + auto zeros = paddle::experimental::full_like(x, 0.0, x.dtype(), x.place()); + auto condition = paddle::experimental::greater_than(x, zeros); + + grad_x = paddle::multiply(grad_out, paddle::where(condition, ones, zeros)); + + return {grad_x}; +} + +std::vector relu_custom_double_backward( + const paddle::Tensor& out, const paddle::Tensor& ddx) { + CHECK_CUSTOM_INPUT(out); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); + auto 
ones = + paddle::experimental::full_like(out, 1.0, out.dtype(), out.place()); + auto zeros = + paddle::experimental::full_like(out, 0.0, out.dtype(), out.place()); + auto condition = paddle::experimental::greater_than(out, zeros); + + ddout = paddle::multiply(ddx, paddle::where(condition, ones, zeros)); + + return {ddout}; +} + +std::vector ReluForward(const paddle::Tensor& x) { + if (x.is_cpu()) { + return relu_cpu_forward(x); + } else if (x.is_custom_device()) { + return relu_custom_forward(x); + } else { + PD_THROW("Not implemented."); + } +} + +std::vector ReluBackward(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out) { + if (x.is_cpu()) { + return relu_cpu_backward(x, out, grad_out); + } else if (x.is_custom_device()) { + return relu_custom_backward(x, out, grad_out); + } else { + PD_THROW("Not implemented."); + } +} + +std::vector ReluDoubleBackward(const paddle::Tensor& out, + const paddle::Tensor& ddx) { + if (out.is_cpu()) { + return relu_cpu_double_backward(out, ddx); + } else if (out.is_custom_device()) { + return relu_custom_double_backward(out, ddx); + } else { + PD_THROW("Not implemented."); + } +} + +std::vector> ReluDoubleBackwardInferShape( + const std::vector& out_shape, + const std::vector& ddx_shape) { + return {out_shape}; +} + +PD_BUILD_OP(custom_relu) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(ReluForward)); + +PD_BUILD_GRAD_OP(custom_relu) + .Inputs({"X", "Out", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(ReluBackward)); + +PD_BUILD_DOUBLE_GRAD_OP(custom_relu) + .Inputs({"Out", paddle::Grad(paddle::Grad("X"))}) + .Outputs({paddle::Grad(paddle::Grad("Out"))}) + .SetKernelFn(PD_KERNEL(ReluDoubleBackward)) + .SetInferShapeFn(PD_INFER_SHAPE(ReluDoubleBackwardInferShape)); diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py new file mode 100644 
index 0000000000000000000000000000000000000000..760ad56cc3380e4d5b53fd65e07638e14d5859f5 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py @@ -0,0 +1,325 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import tempfile +import unittest +from site import getsitepackages + +import numpy as np + + +def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): + import paddle + + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + paddle.set_device(device) + + t = paddle.to_tensor(np_x, dtype=dtype) + t.stop_gradient = False + sys.stdout.flush() + + out = func(t) if use_func else paddle.nn.functional.relu(t) + out.stop_gradient = False + + out.backward() + + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + if t.grad is None: + return out.numpy(), t.grad + else: + return out.numpy(), t.grad.numpy() + + +def custom_relu_static(func, device, dtype, np_x, use_func=True): + import paddle + import paddle.static as static + + paddle.enable_static() + paddle.set_device(device) + + with static.scope_guard(static.Scope()): + with static.program_guard(static.Program()): + x = static.data(name="X", shape=[None, 8], dtype=dtype) + x.stop_gradient = False + out = func(x) if use_func else paddle.nn.functional.relu(x) + static.append_backward(out) + + exe = static.Executor() + 
exe.run(static.default_startup_program()) + # in static mode, x data has been covered by out + out_v = exe.run( + static.default_main_program(), + feed={"X": np_x}, + fetch_list=[out.name], + ) + + paddle.disable_static() + return out_v + + +def custom_relu_static_pe(func, device, dtype, np_x, use_func=True): + import paddle + import paddle.static as static + + paddle.enable_static() + paddle.set_device(device) + + places = paddle.CustomPlace("custom_cpu", 0) + + with static.scope_guard(static.Scope()): + with static.program_guard(static.Program()): + x = static.data(name="X", shape=[None, 8], dtype=dtype) + x.stop_gradient = False + out = func(x) if use_func else paddle.nn.functional.relu(x) + static.append_backward(out) + + exe = static.Executor() + exe.run(static.default_startup_program()) + + # in static mode, x data has been covered by out + compiled_prog = static.CompiledProgram( + static.default_main_program() + ).with_data_parallel(loss_name=out.name, places=places) + out_v = exe.run( + compiled_prog, feed={"X": np_x}, fetch_list=[out.name] + ) + + paddle.disable_static() + return out_v + + +def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): + import paddle + + paddle.set_device(device) + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + + t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + + out = func(t) if use_func else paddle.nn.functional.relu(t) + dx = paddle.grad( + outputs=out, + inputs=t, + grad_outputs=paddle.ones_like(t), + create_graph=True, + retain_graph=True, + ) + + ddout = paddle.grad( + outputs=dx[0], + inputs=out.grad, + grad_outputs=paddle.ones_like(t), + create_graph=False, + ) + + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + assert ddout[0].numpy() is not None + return dx[0].numpy(), ddout[0].numpy() + + +class TestNewCustomOpSetUpInstall(unittest.TestCase): + def setUp(self): + # compile so and set to current path + self.cur_dir = 
os.path.dirname(os.path.abspath(__file__)) + self.temp_dir = tempfile.TemporaryDirectory() + cmd = 'cd {} \ + && git clone {} \ + && cd PaddleCustomDevice \ + && git fetch origin \ + && git checkout {} -b dev \ + && cd backends/custom_cpu \ + && mkdir build && cd build && cmake .. && make -j8 \ + && cd {}'.format( + self.temp_dir.name, + os.getenv('PLUGIN_URL'), + os.getenv('PLUGIN_TAG'), + self.cur_dir, + ) + os.system(cmd) + + # set environment for loading and registering compiled custom kernels + # only valid in current process + os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( + self.cur_dir, + '{}/PaddleCustomDevice/backends/custom_cpu/build'.format( + self.temp_dir.name + ), + ) + + # `import paddle` loads custom_cpu.so, hence we must import paddle after finishing build PaddleCustomDevice + import paddle + + # [Why specific paddle_includes directory?] + # Add paddle_includes to pass CI, for more details, + # please refer to the comments in `paddle/fluid/tests/custom_op/utils.py`` + paddle_includes = [] + for site_packages_path in getsitepackages(): + paddle_includes.append( + os.path.join(site_packages_path, 'paddle', 'include') + ) + paddle_includes.append( + os.path.join( + site_packages_path, 'paddle', 'include', 'third_party' + ) + ) + + custom_module = paddle.utils.cpp_extension.load( + name='custom_device_relu', + sources=['custom_relu_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=["-w", "-g"], # test for cc flags + # build_directory=self.cur_dir, + verbose=True, + ) + self.custom_op = custom_module.custom_relu + + self.dtypes = ["float32", "float64"] + self.device = "custom_cpu" + + # config seed + SEED = 2021 + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + def tearDown(self): + self.temp_dir.cleanup() + del os.environ['CUSTOM_DEVICE_ROOT'] + + def test_custom_device(self): + self._test_static() + self._test_static_pe() + self._test_dynamic() + self._test_double_grad_dynamic() + 
self._test_with_dataloader() + + def _test_static(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_static( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + + def _test_static_pe(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static_pe(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_static_pe( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + + def _test_dynamic(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, x_grad = custom_relu_dynamic( + self.custom_op, self.device, dtype, x + ) + pd_out, pd_x_grad = custom_relu_dynamic( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + np.testing.assert_array_equal( + x_grad, + pd_x_grad, + err_msg="custom op x grad: {},\n paddle api x grad: {}".format( + x_grad, pd_x_grad + ), + ) + + def _test_double_grad_dynamic(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, dx_grad = custom_relu_double_grad_dynamic( + self.custom_op, self.device, dtype, x + ) + pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + np.testing.assert_array_equal( + dx_grad, + pd_dx_grad, + err_msg="custom op dx grad: {},\n paddle api dx grad: {}".format( + dx_grad, pd_dx_grad + ), + ) 
+ + def _test_with_dataloader(self): + import paddle + from paddle.vision.transforms import Compose, Normalize + + paddle.set_device(self.device) + # data loader + transform = Compose( + [Normalize(mean=[127.5], std=[127.5], data_format="CHW")] + ) + train_dataset = paddle.vision.datasets.MNIST( + mode="train", transform=transform + ) + train_loader = paddle.io.DataLoader( + train_dataset, + batch_size=64, + shuffle=True, + drop_last=True, + num_workers=0, + ) + + for batch_id, (image, _) in enumerate(train_loader()): + out = self.custom_op(image) + pd_out = paddle.nn.functional.relu(image) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + + if batch_id == 5: + break + + +if __name__ == "__main__": + if os.name == 'nt' or sys.platform.startswith('darwin'): + # only support Linux now + exit() + unittest.main() diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index dcfe477a76b3e24ec8df2e02d3fe07121f16d9cf..65483d1c6adf68dba55e43180e9993d712193811 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -38,13 +38,13 @@ with fluid.program_guard(main_program=prog): prog_clip = prog.clone() prog_clip.block(0).var(hidden1.name)._set_error_clip( - fluid.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN) + paddle.nn.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN) ) avg_cost_clip = prog_clip.block(0).var(avg_cost.name) fluid.backward.append_backward(loss=avg_cost) fluid.backward.append_backward( - loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback] + loss=avg_cost_clip, callbacks=[paddle.nn.clip.error_clip_callback] ) hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD") diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py index 
f974709ce87abec8679b3846746bbe087e495778..f97faed1d584fce94d8715323e525fea7ac57d49 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py @@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase): opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) acc_steps = 2 # accumulated steps for pipeline diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py index ecc71abe6252cd864f997a0059837efc73a66990..170243fc962839f063a0aafc39adef62fc0d4737 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py @@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase): opt = fluid.optimizer.Momentum( learning_rate=lr_val, momentum=0.9, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) acc_steps = 2 # accumulated steps for pipeline diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py index c7b44fe305d25aa2cac4fd5f4f8ffda56b479940..0d499393f12155aa1d0b73af9f45e2f98a0d2f56 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py @@ -15,10 +15,10 @@ import unittest import paddle -import paddle.fluid.clip as clip import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as regularizer +import paddle.nn.clip as clip paddle.enable_static() @@ -76,7 
+76,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase): rampup_begin_step=0, num_trainers=2, regularization=regularization, - grad_clip=clip.GradientClipByNorm(1.0), + grad_clip=clip.ClipGradByNorm(1.0), ) if use_recompute: @@ -144,14 +144,14 @@ class TestDGCMomentumOptimizer(unittest.TestCase): print("dgc regular_coeff=" + str(coeff)) def test_tpyeError(self): - # the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm' + # the type of DGCMomentumOptimizer(grad_clip=) must be 'ClipGradByNorm' with self.assertRaises(TypeError): dgc_momentum_optimizer = self.MockDGCMomentum( learning_rate=0.01, momentum=0.2, rampup_begin_step=0, num_trainers=2, - grad_clip=clip.GradientClipByGlobalNorm(1.0), + grad_clip=clip.ClipGradByGlobalNorm(1.0), ) def test_momentum_without_dgc(self): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py index eee1235670805f8d66b8206bbdd954129adfba97..0982ab86117c9f1302bb604737ec143902963725 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py @@ -354,7 +354,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -552,7 +552,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 strategy.fuse_grad_merge = True - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -940,7 +940,7 @@ class 
TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -1044,7 +1044,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py index d59c074c03f11dd5ce9acc635216a417e7437f07..46b5fe9ed4b6a641d21d42a0cf1d730314f5a964 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py @@ -640,7 +640,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ) avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'sharding') - clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip ) @@ -1309,7 +1309,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): "micro_batch_size": 2, "accumulate_steps": 4, } - clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip ) @@ -1547,7 +1547,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): "micro_batch_size": 2, "accumulate_steps": 4, } - clip = 
paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py index 130510d90cb045fb77c11f703200df762a9232c9..3be3cfecf16d6ef3e19ef989b1065d592529eb90 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py @@ -488,6 +488,9 @@ class TestProcessGroupFp32(unittest.TestCase): task.wait() print("test reduce prod api ok") + + test_reduce_with_zero_dim([], self.dtype, pg) + # test Scatter # rank 0 in_shape = list(self.shape) @@ -601,5 +604,88 @@ class TestProcessGroupFp16(TestProcessGroupFp32): self.shape = (4, 20, 20) +def test_reduce_with_zero_dim(shape, dtype, pg): + # test Reduce With Zero Dim + # rank 0 + x = np.random.random(shape).astype(dtype) + y = np.random.random(shape).astype(dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, sync_op=True) + paddle.device.cuda.synchronize() + # rank 1 + else: + task = dist.reduce(tensor_y, 0, sync_op=False) + task.wait() + paddle.device.cuda.synchronize() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) and len(tensor_x.shape) == 0 + print("test reduce with zero dim sum api ok\n") + + # test reduce with zero dim max + # rank 0 + x = np.random.random(shape).astype(dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(shape).astype(dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.MAX, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, max_result) and len(tensor_x.shape) == 0 + else: + task = dist.reduce(tensor_y, 0, 
dist.ReduceOp.MAX, sync_op=False) + task.wait() + + print("test reduce with zero dim max api ok") + + # test reduce with zero dim min + # rank 0 + x = np.random.random(shape).astype(dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(shape).astype(dtype) + tensor_y = paddle.to_tensor(y) + + min_result = paddle.minimum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.MIN, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, min_result) and len(tensor_x.shape) == 0 + else: + task = dist.reduce(tensor_y, 0, dist.ReduceOp.MIN, sync_op=False) + task.wait() + + print("test reduce with zero dim min api ok") + + # test reduce with zero dim product + # rank 0 + x = np.random.random(shape).astype(dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(shape).astype(dtype) + tensor_y = paddle.to_tensor(y) + + prod_result = np.multiply(x, y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.PROD, sync_op=False) + task.wait() + assert ( + np.array_equal(tensor_x, prod_result) and len(tensor_x.shape) == 0 + ) + else: + task = dist.reduce(tensor_y, 0, dist.ReduceOp.PROD, sync_op=False) + task.wait() + + print("test reduce with zero dim prod api ok") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index ff9122b1191b64e36ddb40c93f9770d0d5135646..3fa9c12529272c495644508e947d63c6a3f973b2 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -22,8 +22,8 @@ import paddle import paddle.distributed.fleet as fleet import paddle.fluid.core as core from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper -from paddle.fluid.clip import ClipGradBase, _clip_by_global_norm_using_mp_type from 
paddle.incubate import DistributedFusedLamb +from paddle.nn.clip import ClipGradBase, _clip_by_global_norm_using_mp_type from paddle.vision.models import resnet18 as resnet diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py index de3508afcbe2bca43c4a5e762f4519a7e2e4c714..218e3ed4326ad5c0e9282b4dc0026464304ab363 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py @@ -19,6 +19,7 @@ import numpy as np import paddle import paddle.fluid as fluid from paddle.jit.dy2static import Call +from paddle.nn import clip SEED = 2020 np.random.seed(SEED) @@ -89,11 +90,11 @@ def len_with_selected_rows(place): type=fluid.core.VarDesc.VarType.SELECTED_ROWS, ) # y is Variable(SelectedRows) - y = fluid.layers.merge_selected_rows(var) + y = clip.merge_selected_rows(var) y_len = Call(len)(y) # z is inner tensor with shape [4, 2] - z = fluid.layers.get_tensor_from_selected_rows(y) + z = clip.get_tensor_from_selected_rows(y) z_len = Call(len)(z) # set data for selected_rows diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index fabfa8edc3c83aa85a2d9c60bcc6801b5c9a39bd..5c84da8e621be91b434f2926b236e17363f00b30 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -441,5 +441,39 @@ class TestErrorInForLoop(TestTransformForLoop): self.dyfunc = for_loop_dyfunc_not_support +class Net(paddle.nn.Layer): + def __init__(self): + super().__init__() + + self.layer_dict = paddle.nn.LayerDict( + { + "conv1": paddle.nn.Conv2D(3, 3, 1), + "conv2": paddle.nn.Conv2D(3, 3, 1), + "conv3": paddle.nn.Conv2D(3, 3, 1), + } + ) + + def forward(self, x): + out = 0 + for layer_name in self.layer_dict: + out += 
self.layer_dict[layer_name](x) + return out + + +class TestForLoopMeetDict(unittest.TestCase): + def test_start(self): + + net = Net() + model = paddle.jit.to_static( + net, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 3, 224, 224], dtype='float32' + ) + ], + ) + paddle.jit.save(model, "./inference/inference") + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py index a0a45ddbde2bea07d67b7d40299e695b7ce11ff5..64d0d816ba0a5bf0a2e54d5096aeafb2f900f999 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py @@ -22,8 +22,8 @@ from seq2seq_dygraph_model import AttentionModel, BaseModel from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter import paddle.fluid as fluid -from paddle.fluid.clip import GradientClipByGlobalNorm from paddle.jit import ProgramTranslator +from paddle.nn import ClipGradByGlobalNorm place = ( fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace() @@ -71,7 +71,7 @@ def train(args, attn_model=False): dropout=args.dropout, ) - gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm) + gloabl_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm) optimizer = fluid.optimizer.SGD( args.learning_rate, parameter_list=model.parameters(), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 05d26dde6eddfafa703268cd8904a857487cd71d..cfc83bbcb52047bad575bd0a9911f274d68cadb2 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -94,6 +94,19 @@ if(WITH_MKLDNN) endforeach() endif() +# below are cutlass unitests +file( + GLOB TEST_CUTLASS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + 
"test_cutlass_*.py") +string(REPLACE ".py" "" TEST_CUTLASS "${TEST_CUTLASS}") +list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${TEST_CUTLASS}) +if(WITH_CUTLASS) + foreach(target ${TEST_CUTLASS}) + py_test_modules(${target} MODULES ${target}) + endforeach() +endif() + if(WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index b561822f1af92f652f7c8a9851b2d2eee34330df..99450cae46f516ef5af647b667b77789cabd899d 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -74,6 +74,8 @@ class IgnoreReasons(enum.Enum): PASS_ACCURACY_ERROR = 2 # Accuracy is abnormal after enabling mkldnn. MKLDNN_ACCURACY_ERROR = 3 + # Accuracy is abnormal after enabling cutlass. + CUTLASS_ACCURACY_ERROR = 3 # TODO(wilber): just for backward compatible @@ -877,3 +879,96 @@ class TrtLayerAutoScanTest(AutoScanTest): note: str, ): self.ignore_cases.append((teller, reason, note)) + + +class CutlassAutoScanTest(AutoScanTest): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def run_test(self, quant=False, *args, **kwargs): + status = True + + for prog_config in self.sample_program_configs(*args, **kwargs): + # if program is invalid, we should skip that cases. 
+ if not self.is_program_valid(prog_config): + continue + + model, params = create_fake_model(prog_config) + feed_data = {} + for name, tensor_config in prog_config.inputs.items(): + feed_data[name] = { + 'data': tensor_config.data, + 'lod': tensor_config.lod, + } + results: List[Dict[str, np.ndarray]] = [] + + # baseline: gpu no ir_optim run + base_config = self.create_inference_config( + ir_optim=False, use_gpu=True + ) + logging.info('RUN program_config: ' + str(prog_config)) + results.append( + self.run_test_config( + model, params, prog_config, base_config, feed_data + ) + ) + self.success_log('RUN_GPU_BASELINE done') + + for pred_config, (atol, rtol) in self.sample_predictor_configs( + prog_config + ): + # skip info + ignore_flag = False + for ignore_info in self.ignore_cases: + if ignore_info[0](prog_config, pred_config): + ignore_flag = True + if ( + ignore_info[1] + == IgnoreReasons.CUTLASS_ACCURACY_ERROR + ): + self.ignore_log( + "[CUTLASS_ACCURACY_ERROR] " + + ignore_info[2] + + ' ' + + ' vs ' + + self.inference_config_str(pred_config) + ) + else: + raise NotImplementedError + break + + if os.path.exists(self.cache_dir): + shutil.rmtree(self.cache_dir) + if not os.path.exists(self.cache_dir): + os.mkdir(self.cache_dir) + + try: + results.append( + self.run_test_config( + model, params, prog_config, pred_config, feed_data + ) + ) + self.assert_tensors_near( + atol, rtol, results[-1], results[0] + ) + except Exception as e: + self.fail_log( + self.inference_config_str(pred_config) + + '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e)) + ) + if not ignore_flag: + status = False + continue + self.success_log( + 'RUN predictor_config ' + + self.inference_config_str(pred_config) + + ' done' + ) + + self.assertTrue(status) + + def inference_config_str(self, config) -> str: + dic = {} + enable_gpu = config.use_gpu() + dic['use_gpu'] = enable_gpu + return str(dic) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_cutlass_conv2d_fusion_op.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_cutlass_conv2d_fusion_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8adeff0f73ddf96ee78ff3d0631547e7259491c8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_cutlass_conv2d_fusion_op.py @@ -0,0 +1,306 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from functools import partial +from itertools import product + +import numpy as np +from auto_scan_test import CutlassAutoScanTest +from program_config import ProgramConfig, TensorConfig + +import paddle.inference as paddle_infer + + +# cba pattern +class TestCutlassConv2dFusionOp1(CutlassAutoScanTest): + def sample_program_configs(self, *args, **kwargs): + def generate_input1(input_shape): + return np.random.random(input_shape).astype(np.float32) + + def generate_weight(weight_shape): + return np.random.random(weight_shape).astype(np.float32) + + def generate_bias(bias_shape): + return np.random.random(bias_shape).astype(np.float32) + + input_shape_options = [[1, 16, 112, 112], [1, 8, 64, 64]] + weight_shape_options = [[24, -1, 3, 3]] + strides_options = [[1, 1], [2, 2]] + paddings_options = [[1, 1], [1, 0, 1, 2]] + groups_options = [1] + padding_algorithm_options = ['EXPLICIT'] + dilations_options = [[2, 2], [1, 1]] + data_format_options = ['NCHW'] + act_options = ['relu', 'leaky_relu', 'swish'] + + configurations = [ + 
input_shape_options, + weight_shape_options, + strides_options, + paddings_options, + groups_options, + padding_algorithm_options, + dilations_options, + data_format_options, + act_options, + ] + + for ( + input_shape, + weight_shape, + strides, + paddings, + groups, + padding_algorithm, + dilations, + data_format, + act, + ) in product(*configurations): + + weight_shape[1] = input_shape[1] + attrs = [ + { + "strides": strides, + "paddings": paddings, + "groups": groups, + "padding_algorithm": padding_algorithm, + "dilations": dilations, + "data_format": data_format, + }, + {"axis": 1}, + ] + + ops_config = [ + { + "op_type": "conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"], + }, + "op_outputs": {"Output": ["conv_output_data"]}, + "op_attrs": attrs[0], + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["conv_output_data"], + "Y": ["elementwise_weight"], + }, + "op_outputs": {"Out": ["output_data0"]}, + "op_attrs": attrs[1], + }, + { + "op_type": act, + "op_inputs": {"X": ["output_data0"]}, + "op_outputs": {"Out": ["output_data1"]}, + "op_attrs": {}, + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": TensorConfig( + data_gen=partial(generate_weight, weight_shape) + ), + "elementwise_weight": TensorConfig( + data_gen=partial(generate_bias, [weight_shape[0]]) + ), + }, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, input_shape) + ) + }, + outputs=["output_data1"], + ) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_gpu=True) + config.enable_use_gpu(256, 0, paddle_infer.PrecisionType.Half) + config.exp_enable_use_cutlass() + yield config, (1e-2, 1e-2) + + def test(self, *args, **kwargs): + self.run_test(quant=False, *args, **kwargs) + + +# cbaa pattern +class TestCutlassConv2dFusionOp2(CutlassAutoScanTest): + def 
sample_program_configs(self, *args, **kwargs): + def generate_input(input_shape): + return (np.random.random(input_shape) * 2 - 1).astype(np.float32) + + def generate_weight(weight_shape): + return (np.random.random(weight_shape) * 2 - 1).astype(np.float32) + + def generate_bias(bias_shape): + return np.random.random(bias_shape).astype(np.float32) + + input_shape_options = [[1, 16, 112, 112], [1, 24, 64, 64]] + weight_shape_options = [[24, -1, 3, 3]] + strides_options = [[2, 2], [1, 1]] + paddings_options = [[1, 1]] + groups_options = [1] + padding_algorithm_options = ['EXPLICIT'] + dilations_options = [[1, 1]] + data_format_options = ['NCHW'] + act_options = ['relu'] + + configurations = [ + input_shape_options, + weight_shape_options, + strides_options, + paddings_options, + groups_options, + padding_algorithm_options, + dilations_options, + data_format_options, + act_options, + ] + + for ( + input_shape, + weight_shape, + strides, + paddings, + groups, + padding_algorithm, + dilations, + data_format, + act, + ) in product(*configurations): + weight_shape[1] = input_shape[1] + residual_shape = list(input_shape) + residual_shape[1] = weight_shape[0] + + ih = input_shape[2] + iw = input_shape[3] + pad_h0 = 0 + pad_h1 = 0 + pad_w0 = 0 + pad_w1 = 0 + if len(paddings) == 2: + pad_h0 = paddings[0] + pad_h1 = paddings[0] + pad_w0 = paddings[1] + pad_w1 = paddings[1] + elif len(paddings) == 4: + pad_h0 = paddings[0] + pad_h1 = paddings[1] + pad_w0 = paddings[2] + pad_w1 = paddings[3] + dilation_h = dilations[0] + dilation_w = dilations[1] + kh = weight_shape[2] + kw = weight_shape[3] + stride_h = strides[0] + stride_w = strides[1] + residual_shape[2] = (int)( + (ih + pad_h0 + pad_h1 - dilation_h * (kh - 1) - 1) / stride_h + ) + 1 + residual_shape[3] = (int)( + (iw + pad_w0 + pad_w1 - dilation_w * (kw - 1) - 1) / stride_w + ) + 1 + + attrs = [ + { + "strides": strides, + "paddings": paddings, + "groups": groups, + "padding_algorithm": padding_algorithm, + "dilations": 
dilations, + "data_format": data_format, + }, + {"axis": 1}, + ] + + ops_config = [ + { + "op_type": "conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"], + }, + "op_outputs": {"Output": ["conv_output_data"]}, + "op_attrs": attrs[0], + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["conv_output_data"], + "Y": ["elementwise_weight"], + }, + "op_outputs": {"Out": ["output_data0"]}, + "op_attrs": attrs[1], + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["residual_data"], + "Y": ["output_data0"], + }, + "op_outputs": {"Out": ["output_data1"]}, + "op_attrs": {}, + }, + { + "op_type": act, + "op_inputs": {"X": ["output_data1"]}, + "op_outputs": {"Out": ["output_data2"]}, + "op_attrs": {}, + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": TensorConfig( + data_gen=partial(generate_weight, weight_shape) + ), + "elementwise_weight": TensorConfig( + data_gen=partial(generate_bias, [weight_shape[0]]) + ), + }, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, input_shape) + ), + "residual_data": TensorConfig( + data_gen=partial(generate_input, residual_shape) + ), + }, + outputs=["output_data2"], + ) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_gpu=True) + config.enable_use_gpu(256, 0, paddle_infer.PrecisionType.Half) + config.exp_enable_use_cutlass() + yield config, (1e-2, 1e-2) + + def test(self, *args, **kwargs): + self.run_test(quant=False, *args, **kwargs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index b1890ea95ab9782187c66d58027422e7481b0602..24a63751cfec431d4335baa793543da3ba48d83d 100644 --- 
a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -127,7 +127,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Clip( ): def set_params(self): self.operand = paddle.add - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 @@ -219,7 +219,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Clip( ): def set_params(self): self.operand = paddle.subtract - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 @@ -319,7 +319,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Clip( ): def set_params(self): self.operand = paddle.multiply - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py index d696fa44f5aaa081b0587c2049c68c90d55637d3..5c1a11625611ba67b82c3d462dcc87d1d0998708 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py @@ -37,74 +37,55 @@ class TestScaleMatmulMkldnnFusePass(PassAutoScanTest): input_dim = draw(st.sampled_from([1, 32, 64])) def generate_input(attrs, type): - if attrs[1]['transpose_X'] and attrs[1]['transpose_Y']: - shape_x = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - attrs[2]['input_dim'], - 32, - ] - shape_y = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - 64, - attrs[2]['input_dim'], - ] - elif attrs[1]['transpose_X']: - shape_x = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - attrs[2]['input_dim'], - 32, - ] - shape_y = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - attrs[2]['input_dim'], - 64, - ] - elif attrs[1]['transpose_Y']: - 
shape_x = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - 32, - attrs[2]['input_dim'], - ] - shape_y = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - 8, - attrs[2]['input_dim'], - ] + is_transpose_X = attrs[1]['transpose_X'] + is_transpose_Y = attrs[1]['transpose_Y'] + + if is_transpose_X: + shape_x_3 = attrs[2]['input_dim'] + shape_x_4 = 32 else: - shape_x = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - 32, - attrs[2]['input_dim'], - ] - shape_y = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - attrs[2]['input_dim'], - 16, - ] - - if type == "x": - return np.random.random(shape_x).astype(np.float32) + shape_x_3 = 32 + shape_x_4 = attrs[2]['input_dim'] + + if is_transpose_X and is_transpose_Y: + shape_y_3 = 64 + shape_y_4 = attrs[2]['input_dim'] + elif is_transpose_X: + shape_y_3 = attrs[2]['input_dim'] + shape_y_4 = 64 + elif is_transpose_Y: + shape_y_3 = 8 + shape_y_4 = attrs[2]['input_dim'] else: - return np.random.random(shape_y).astype(np.float32) + shape_y_3 = attrs[2]['input_dim'] + shape_y_4 = 16 + + shape_x = [ + attrs[2]['batch_size'], + attrs[2]['channel'], + shape_x_3, + shape_x_4, + ] + shape_y = [ + attrs[2]['batch_size'], + attrs[2]['channel'], + shape_y_3, + shape_y_4, + ] + + shape = shape_x if type == 'x' else shape_y + return np.random.random(shape).astype(np.float32) attrs = [ { - "scale": scale, - "bias": bias, - "bias_after_scale": bias_after_scale, + 'scale': scale, + 'bias': bias, + 'bias_after_scale': bias_after_scale, }, { - "transpose_X": transpose_X, - "transpose_Y": transpose_Y, - "alpha": alpha, + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha, }, { 'batch_size': batch_size, @@ -115,29 +96,29 @@ class TestScaleMatmulMkldnnFusePass(PassAutoScanTest): ops_config = [ { - "op_type": "scale", - "op_inputs": {"X": ["input_data1"]}, - "op_outputs": {"Out": ["scale_output"]}, - "op_attrs": { - "scale": attrs[0]['scale'], - "bias": attrs[0]['bias'], - "bias_after_scale": 
attrs[0]['bias_after_scale'], + 'op_type': 'scale', + 'op_inputs': {'X': ['input_data1']}, + 'op_outputs': {'Out': ['scale_output']}, + 'op_attrs': { + 'scale': attrs[0]['scale'], + 'bias': attrs[0]['bias'], + 'bias_after_scale': attrs[0]['bias_after_scale'], }, }, { - "op_type": "matmul", - "op_inputs": {"X": ["scale_output"], "Y": ["input_data2"]}, - "op_outputs": {"Out": ["matmul_output"]}, - "op_attrs": { + 'op_type': 'matmul', + 'op_inputs': {'X': ['scale_output'], 'Y': ['input_data2']}, + 'op_outputs': {'Out': ['matmul_output']}, + 'op_attrs': { 'transpose_X': attrs[1]['transpose_X'], 'transpose_Y': attrs[1]['transpose_Y'], 'alpha': attrs[1]['alpha'], - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [], + 'fused_reshape_X': [], + 'fused_reshape_Y': [], + 'fused_transpose_X': [], + 'fused_transpose_Y': [], + 'fused_reshape_Out': [], + 'fused_transpose_Out': [], }, }, ] @@ -148,25 +129,27 @@ class TestScaleMatmulMkldnnFusePass(PassAutoScanTest): ops=ops, weights={}, inputs={ - "input_data1": TensorConfig( - data_gen=partial(generate_input, attrs, "x") + 'input_data1': TensorConfig( + data_gen=partial(generate_input, attrs, 'x') ), - "input_data2": TensorConfig( - data_gen=partial(generate_input, attrs, "y") + 'input_data2': TensorConfig( + data_gen=partial(generate_input, attrs, 'y') ), }, - outputs=["matmul_output"], + outputs=['matmul_output'], ) return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config( + use_mkldnn=True, passes=['scale_matmul_fuse_pass'] + ) yield config, ['matmul'], (1e-5, 1e-5) def test(self): - self.run_and_statis(quant=False, passes=["scale_matmul_fuse_pass"]) + self.run_and_statis(quant=False, passes=['scale_matmul_fuse_pass']) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 3a39c84141ced2c0f0538350b3d70c7d9bcaf9c3..0c205fbee7c87079035221e457663c24b0234ced 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -106,7 +106,7 @@ class TensorRTSubgraphPassHardSwishPluginTest( class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.clip(x, 0, 1) + return paddle.clip(x, 0, 1) class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py index 35780b491cc7633e84427b457456202f1bc245e4..ce46c79cbbd3dd5ee600c47874f5d1e1c3ee1bee 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py @@ -39,45 +39,46 @@ class TrtConvertElementwiseTest_one_input_corner_case(TrtLayerAutoScanTest): def generate_input(shape): return np.random.random(shape).astype(np.float32) - for batch in [1, 2, 4]: - for shape in [[batch, 1], [batch, 1, 32], [batch, 1, 16, 32]]: - for axis in [-1 if len(shape) == 1 else 1]: - self.dims = len(shape) - dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}] - ops_config = [ - { - "op_type": "equal", - "op_inputs": { - "X": ["input_data1"], - "Y": ["input_data2"], + for op_type in ["equal", "not_equal"]: + for batch in [1, 2, 4]: + for shape in [[batch, 1], [batch, 1, 32], [batch, 1, 16, 32]]: + for axis in [-1 if len(shape) == 1 else 1]: + self.dims = len(shape) + dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + 
"Y": ["input_data2"], + }, + "op_outputs": {"Out": ["compare_output_data"]}, + "op_attrs": dics[0], }, - "op_outputs": {"Out": ["compare_output_data"]}, - "op_attrs": dics[0], - }, - { - "op_type": "cast", - "op_inputs": {"X": ["compare_output_data"]}, - "op_outputs": {"Out": ["output_data"]}, - "op_attrs": dics[1], - }, - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data1": TensorConfig( - data_gen=partial(generate_input, shape) - ), - "input_data2": TensorConfig( - data_gen=partial(generate_input, shape) - ), - }, - outputs=["output_data"], - ) - - yield program_config + { + "op_type": "cast", + "op_inputs": {"X": ["compare_output_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[1], + }, + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape) + ), + }, + outputs=["output_data"], + ) + + yield program_config def sample_predictor_configs( self, program_config diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py index 8b13546d9a2852009dfba4744b5bdfaaac07d3d0..122429a7f8454cd687b90b3e503b531727d478f0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py @@ -117,13 +117,13 @@ class TestClipOpError(unittest.TestCase): input_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) def test_dtype(): x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) + paddle.clip(x=x2, min=-1.0, max=1.0) 
self.assertRaises(TypeError, test_dtype) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 912f52969d712e1a03da97b9a9d119ab99161b22..d0e6c98e25a422c8eeeccb1feb1544b144152316 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -686,7 +686,7 @@ class TestAdamOpV2(unittest.TestCase): value = np.arange(26).reshape(2, 13).astype("float32") a = fluid.dygraph.to_variable(value) linear = paddle.nn.Linear(13, 5) - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) adam = paddle.optimizer.Adam( 0.1, parameters=linear.parameters(), grad_clip=clip ) diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index a9d79f81bf310b9a1d94202c655571c948857909..ce3dd7509ce1d8cfddfc06af95a7f2d2358c8b5c 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -20,12 +20,13 @@ from op_test import OpTest import paddle import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.nn import clip class TestClipByNormOp(OpTest): def setUp(self): self.max_relative_error = 0.006 - self.python_api = fluid.layers.clip_by_norm + self.python_api = clip.clip_by_norm self.init_dtype() self.initTestCase() input = np.random.random(self.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index a25edccb97a4edca00c4f24e4cd020c11062c449..359220a7a601f131f89e68c6da8b424d20070c3d 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -128,15 +128,9 @@ class TestClipOpError(unittest.TestCase): input_data = np.random.random((2, 
4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) - - def test_dtype(): - x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) - - self.assertRaises(TypeError, test_dtype) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index d5ad18fc434cbe9075604b9bef0798afeaa0c8a6..c6bdd59d496634744da2673d7f2ca8b103346376 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -584,7 +584,7 @@ class TestL2Decay(TranspilerTest): def filter(param): return param.name == "fc_w" - clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter) + clip = paddle.nn.ClipGradByValue(0.1, need_clip=filter) sgd_optimizer.minimize(avg_cost, grad_clip=clip) def transpiler_test_impl(self): diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index d0256b5dfb8994c3ee27fb0c2c29ab3bd136d4ac..80bc977f091bac9e57c5e4774e5236a96115c22c 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -504,8 +504,8 @@ class PaddingRNNTestBase(unittest.TestCase): self.feed_order, ) = res_vars - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm( + paddle.nn.clip.set_gradient_clip( + clip=paddle.nn.ClipGradByGlobalNorm( clip_norm=config.max_grad_norm ) ) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py index e81fc34ea2ca0fb8eea864a51791bf7b13a5abc0..400009f820de3c59cafb87582ca43c77dc7ae176 100644 --- 
a/python/paddle/fluid/tests/unittests/test_fleet_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py @@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # TODO: section_program will be removed in the future diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py index 0de28e9839efa344244eeba0e60ad93afdca0291..d24348b7d77b58234f2dbc1ef9d7ae7d563a19d3 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py @@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # TODO: section_program will be removed in the future diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py index 36a85e2d74fc7af46061dc3ccef0e1255cdaa056..46eb0dc6f0bf8428ca0b5b6989fb6444ca5b2495 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py @@ -47,7 +47,7 @@ class TestFleetExecutor(unittest.TestCase): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # TODO: section_program will be removed in the future diff --git 
a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py index a1e144da146a57ae277d40b381682bc458cffab7..1d4e079f9f84ad71bb366b7d11516570c9832a98 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py @@ -70,5 +70,17 @@ class TestFunctionalConv1DErrorCase1(TestFunctionalConv1DError): self.data_format = "NCL" +class TestFunctionalConv1DErrorCase2(TestFunctionalConv1DError): + def setUp(self): + self.input = np.random.randn(1, 3, 3) + self.filter = np.random.randn(3) + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 1 + self.data_format = "NCL" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py index ab5b9096dcc8ad1c3df4485e3c805abbb3a05eec..d1e3e6df335b002a64d4dc33e5de001dab8c5546 100644 --- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py @@ -20,6 +20,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import Program, program_guard from paddle.fluid.op import Operator +from paddle.nn import clip class TestGetTensorFromSelectedRowsError(unittest.TestCase): @@ -31,12 +32,12 @@ class TestGetTensorFromSelectedRowsError(unittest.TestCase): x_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.get_tensor_from_selected_rows(x=x_data) + clip.get_tensor_from_selected_rows(x=x_data) self.assertRaises(TypeError, test_Variable) def test_SELECTED_ROWS(): - fluid.layers.get_tensor_from_selected_rows(x=x_var) + clip.get_tensor_from_selected_rows(x=x_var) self.assertRaises(TypeError, test_SELECTED_ROWS) 
diff --git a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py index db34123d3bdd8f3faf27f5a8ba51ddb881fcbe87..4cb4b5d773b48ded81187c29993ec9912cb56457 100644 --- a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py +++ b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py @@ -17,12 +17,8 @@ import unittest import numpy as np import paddle.fluid as fluid -from paddle.fluid.clip import ( - GradientClipByGlobalNorm, - GradientClipByNorm, - GradientClipByValue, -) from paddle.fluid.dygraph.base import to_variable +from paddle.nn import ClipGradByGlobalNorm, ClipGradByNorm, ClipGradByValue class TestGradClipByGlobalNorm(unittest.TestCase): @@ -67,7 +63,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase): def get_dygrap_global_norm_result(self): with fluid.dygraph.guard(): - gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm) + gloabl_norm_clip = ClipGradByGlobalNorm(self.max_global_norm) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -142,7 +138,7 @@ class TestGradClipByNorm(unittest.TestCase): def get_dygrap_norm_result(self): with fluid.dygraph.guard(): - norm_clip = GradientClipByNorm(self.max_norm) + norm_clip = ClipGradByNorm(self.max_norm) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -212,9 +208,7 @@ class TestGradClipByValue(unittest.TestCase): def get_dygrap_clip_result(self): with fluid.dygraph.guard(): - value_clip = GradientClipByValue( - max=self.max_value, min=self.min_value - ) + value_clip = ClipGradByValue(max=self.max_value, min=self.min_value) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 2243ae8c45602a694e1ce79e72cbc033abaf1636..b5b0b20c6f48bc841bd0dfb5f9a61449cadc93bf 100644 --- 
a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -20,7 +20,7 @@ from fake_reader import fake_imdb_reader import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip +from paddle.nn.clip import _allow_pure_fp16_global_norm_clip paddle.enable_static() @@ -173,9 +173,9 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # test whether the output is right when use 'set_gradient_clip' def test_old_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) - fluid.clip.set_gradient_clip(clip) - return fluid.clip.append_gradient_clip_ops(params_grads) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) + paddle.nn.clip.set_gradient_clip(clip) + return paddle.nn.clip.append_gradient_clip_ops(params_grads) self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) @@ -183,7 +183,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # test whether the output is right when use grad_clip def test_new_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -192,7 +192,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # test whether the output is right when use grad_clip under float64 def test_new_gradient_clip_fp64(self): def func(params_grads): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -201,15 +201,15 @@ class TestGradientClipByGlobalNorm(TestGradientClip): # invoke 'set_gradient_clip' in a wrong order def test_wrong_API_order(self): def backward_func(cost): - clip = 
fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) - fluid.clip.set_gradient_clip(clip) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0) + paddle.nn.clip.set_gradient_clip(clip) sgd_optimizer = fluid.optimizer.SGD( learning_rate=0.01, grad_clip=clip ) # if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective sgd_optimizer.minimize(cost) # 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective - fluid.clip.set_gradient_clip(clip) + paddle.nn.clip.set_gradient_clip(clip) self.backward_and_optimize = backward_func for place in self.get_places(): @@ -269,7 +269,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): with fluid.program_guard( main_program=prog, startup_program=startup_program ): - clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) x = ( fluid.default_main_program() .global_block() @@ -313,7 +313,7 @@ class TestGradientClipByNorm(TestGradientClip): # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -321,7 +321,7 @@ class TestGradientClipByNorm(TestGradientClip): # if grad is None or not need clip def test_none_grad(self): - clip = fluid.clip.GradientClipByNorm(self.clip_norm) + clip = paddle.nn.ClipGradByNorm(self.clip_norm) x = ( fluid.default_main_program() .global_block() @@ -371,7 +371,7 @@ class TestGradientClipByValue(TestGradientClip): # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) + clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min) return clip(params_grads) self.clip_gradient = func @@ -379,7 +379,7 @@ class 
TestGradientClipByValue(TestGradientClip): # if grad is None or not need clip def test_none_grad(self): - clip = fluid.clip.GradientClipByValue(self.max, self.min) + clip = paddle.nn.ClipGradByValue(self.max, self.min) x = ( fluid.default_main_program() .global_block() @@ -419,7 +419,7 @@ class TestDygraphGradientClip(unittest.TestCase): sgd_optimizer = fluid.optimizer.SGD( learning_rate=0.0, parameter_list=linear.parameters(), - grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1), + grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1), ) self.check_clip_result(loss, sgd_optimizer) @@ -430,12 +430,8 @@ class TestDygraphGradientClip(unittest.TestCase): class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): def setUp(self): self.clip_norm = 0.8 - self.clip1 = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm - ) - self.clip2 = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm - ) + self.clip1 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) + self.clip2 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) def check_clip_result(self, loss, optimizer): # if grad is None @@ -476,7 +472,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): class TestDygraphGradientClipByNorm(TestDygraphGradientClip): def setUp(self): self.clip_norm = 0.8 - self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) + self.clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm) def check_clip_result(self, loss, optimizer): # if grad is None @@ -506,7 +502,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip): def setUp(self): self.max = 0.2 self.min = 0.1 - self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) + self.clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min) def check_clip_result(self, loss, optimizer): # if grad is None @@ -572,7 +568,7 @@ class TestDygraphGradientClipFP16(unittest.TestCase): params_grads.append((param, param._grad_ivar())) _, grads = 
zip(*params_grads) # clip grads - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.8) params_grads = clip(params_grads) _, grads_clip = zip(*params_grads) # param update @@ -616,7 +612,7 @@ class TestDygraphGradientClipFP64(unittest.TestCase): params_grads.append((param, param._grad_ivar())) _, grads = zip(*params_grads) # clip grads - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.1) params_grads = clip(params_grads) _, grads_clip = zip(*params_grads) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index ecb35e8eaf950cb3f88bea4fecf70c42d1f45363..54cba6eb800295e6a69c9e64be53d7798743383a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -361,7 +361,7 @@ class TestImperativeAutoPrune(unittest.TestCase): place = fluid.CPUPlace() with fluid.dygraph.guard(place): model = MyLayer(size, vocab_size, size) - grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters(), grad_clip=grad_clip ) @@ -380,7 +380,7 @@ class TestImperativeAutoPrune(unittest.TestCase): with fluid.dygraph.guard(place): model = MyLayer2(size, vocab_size, size) - grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters(), grad_clip=grad_clip ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index cea97398d17159aa0756d5e985b77de0db772ddc..5cc7f63eb7883b1dc260445dcd4f9f1a98c28b99 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -52,7 +52,7 @@ class TestSimpleNet(unittest.TestCase): fluid.set_flags( {'FLAGS_sort_sum_gradient': sort_sum_gradient} ) - # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) + # grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = paddle.to_tensor(input_word) @@ -91,7 +91,7 @@ class TestSimpleNet(unittest.TestCase): fluid.set_flags( {'FLAGS_sort_sum_gradient': sort_sum_gradient} ) - grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) + grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = to_variable(input_word) diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py index 735f62c646b16af1a3033b00a7712591ca8b2503..ee42ce1625feccc80d8ce72a862395b1cdc6f756 100644 --- a/python/paddle/fluid/tests/unittests/test_pad_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad_op.py @@ -187,9 +187,14 @@ class TestPaddingValueTensor3(unittest.TestCase): x = paddle.assign(np_x).astype('float32') pad_value = paddle.assign([0.0]).astype('float64') y = paddle.nn.functional.pad(x, [0, 1, 2, 3], value=pad_value) + loss = y.sum() + optimize_ops, params_grads = paddle.optimizer.SGD(0.01).minimize( + loss + ) exe = paddle.static.Executor(paddle.CPUPlace()) - [pd_out] = exe.run(main_prog, fetch_list=[y]) + res = exe.run(main_prog, fetch_list=[y] + [g for p, g in params_grads]) + pd_out = res[0] np_out = np.pad(np_x, [(0, 1), (2, 3)], constant_values=0.0) np.testing.assert_allclose(pd_out, np_out) diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index c1ff35222d8ae6dceed96fc31701b37c87070db3..e39648285daba775614a67c8de6ff920f89cb4f8 100644 --- 
a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -293,9 +293,6 @@ class TestFLOPSAPI(unittest.TestCase): ) == 3 * 12 * 12 * 12 * 2 * 8 ) - self.assertTrue( - flops('relu', {'X': [[12, 12, 12]]}, {}) == 12 * 12 * 12 - ) self.assertTrue( flops('softmax', {'X': [[12, 12, 12]]}, {}) == 3 * 12 * 12 * 12 ) @@ -303,6 +300,56 @@ class TestFLOPSAPI(unittest.TestCase): flops('c_embedding', {'Ids': [[12, 12]], 'W': [[12, 12, 3]]}, {}) == 0 ) + self.assertTrue( + flops( + 'elu', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) + self.assertTrue( + flops( + 'leaky_relu', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) + self.assertTrue( + flops( + 'prelu', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) + self.assertTrue( + flops( + 'relu6', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) + self.assertTrue( + flops( + 'silu', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index a31749d744aead800a038300b3eeafad51b175c7..887ce9ff3f7411bb01115e8d648b53be0ec7de31 100755 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -49,20 +49,20 @@ class TestReshapeOp(OpTest): class TestReshapeOp_ZeroDim1(OpTest): def init_data(self): self.ori_shape = () - self.new_shape = 1 - self.infered_shape = 1 + self.new_shape = (1,) + self.infered_shape = (1,) class TestReshapeOp_ZeroDim2(OpTest): def init_data(self): self.ori_shape = () - self.new_shape = -1 - self.infered_shape = 1 + self.new_shape = (-1,) + self.infered_shape = (1,) class TestReshapeOp_ZeroDim3(OpTest): def init_data(self): - self.ori_shape = 1 + self.ori_shape = (1,) self.new_shape = () self.infered_shape = () diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py 
b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index c99aabbf9b5a1492453f9c17bd548cd6a84ed508..19a4711fc5b680cc5a86d4d9729cea233dcd43e3 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -454,6 +454,15 @@ class TestSundryAPI(unittest.TestCase): paddle.disable_static() self.x = paddle.rand([]) + def test_flip(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.flip(x, axis=[]) + out.backward() + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + def test_linear(self): x = paddle.randn([3, 2]) w = paddle.full(shape=[2, 4], fill_value=0.5) @@ -747,6 +756,105 @@ class TestSundryAPI(unittest.TestCase): np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + def test_reshape_list(self): + x = paddle.rand([]) + x.stop_gradient = False + + out = paddle.reshape(x, []) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + out = paddle.reshape(x, [1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1, 1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape_tensor(self): + x = paddle.rand([1, 1]) + x.stop_gradient = False + + out = paddle.reshape(x, []) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + new_shape = paddle.full([1], 1, "int32") + out = paddle.reshape(x, new_shape) + out.backward() + 
self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = paddle.full([1], -1, "int32") + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape__list(self): + x = paddle.rand([]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + out = paddle.reshape_(x, [1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1, 1]) + self.assertEqual(out.shape, [1, 1]) + + def test_reshape__tensor(self): + x = paddle.rand([1, 1]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + new_shape = paddle.full([1], 1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = paddle.full([1], -1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1, 1]) + + def test_reverse(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.reverse(x, axis=[]) + out.backward() + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + def test_sort(self): x1 = paddle.rand([]) x2 = paddle.rand([]) @@ -789,6 +897,18 @@ class TestSundryAPIStatic(unittest.TestCase): paddle.enable_static() self.exe = paddle.static.Executor() + @prog_scope() + def test_flip(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.flip(x, axis=[]) + 
paddle.static.append_backward(out) + + program = paddle.static.default_main_program() + res1, res2 = self.exe.run(program, fetch_list=[x, out]) + self.assertEqual(res1.shape, ()) + self.assertEqual(res2.shape, ()) + @prog_scope() def test_pow_factor(self): x = paddle.rand([]) @@ -1027,6 +1147,7 @@ class TestSundryAPIStatic(unittest.TestCase): np.testing.assert_array_equal(out3_2, np.asarray(1)) @prog_scope() +<<<<<<< HEAD def test_sort(self): x1 = paddle.rand([]) x1.stop_gradient = False @@ -1061,6 +1182,78 @@ class TestSundryAPIStatic(unittest.TestCase): self.assertEqual(res[0].shape, ()) self.assertEqual(res[1].shape, ()) +======= + def test_reshape_list(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x3 = paddle.rand([]) + x4 = paddle.rand([]) + x1.stop_gradient = False + x2.stop_gradient = False + x3.stop_gradient = False + x4.stop_gradient = False + + out1 = paddle.reshape(x1, []) + paddle.static.append_backward(out1) + + out2 = paddle.reshape(x2, [1]) + paddle.static.append_backward(out2) + + out3 = paddle.reshape(x3, [-1]) + paddle.static.append_backward(out3) + + out4 = paddle.reshape(x4, [-1, 1]) + paddle.static.append_backward(out4) + + program = paddle.static.default_main_program() + res1, res2, res3, res4 = self.exe.run( + program, fetch_list=[out1, out2, out3, out4] + ) + self.assertEqual(res1.shape, ()) + self.assertEqual(res2.shape, (1,)) + self.assertEqual(res3.shape, (1,)) + self.assertEqual(res4.shape, (1, 1)) + + @prog_scope() + def test_reshape_tensor(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x3 = paddle.rand([]) + x1.stop_gradient = False + x2.stop_gradient = False + x3.stop_gradient = False + + new_shape = paddle.full([1], 1, "int32") + out1 = paddle.reshape(x1, new_shape) + paddle.static.append_backward(out1) + + new_shape = paddle.full([1], -1, "int32") + out2 = paddle.reshape(x2, new_shape) + paddle.static.append_backward(out2) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out3 = 
paddle.reshape(x3, new_shape) + paddle.static.append_backward(out3) + + program = paddle.static.default_main_program() + res1, res2, res3 = self.exe.run(program, fetch_list=[out1, out2, out3]) + self.assertEqual(res1.shape, (1,)) + self.assertEqual(res2.shape, (1,)) + self.assertEqual(res3.shape, (1, 1)) + + @prog_scope() + def test_reverse(self): + x = paddle.rand([]) + x.stop_gradient = False + + out = paddle.reverse(x, axis=[]) + paddle.static.append_backward(out) + + program = paddle.static.default_main_program() + res1, res2 = self.exe.run(program, fetch_list=[x, out]) + self.assertEqual(res1.shape, ()) + self.assertEqual(res2.shape, ()) +>>>>>>> c123dd1e4032efdbfff0bf0c35a58155f2d6e1d9 # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py index 9efb334ac7dd5e0618491c98aee1ae0e2c5a83e7..e4982c42e4e100a3008c9431621c505a042d237e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py @@ -131,13 +131,13 @@ class TestClipOpError(unittest.TestCase): input_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) def test_dtype(): x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) + paddle.clip(x=x2, min=-1.0, max=1.0) self.assertRaises(TypeError, test_dtype) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py index ebd4354593dba2a66980eb9f6dc30ed5f78fce16..8cb27ecf0992ad534b590946a1734d4947340e5e 100644 --- 
a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -556,6 +556,96 @@ class TestSundryAPI(unittest.TestCase): np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + def test_reshape_list(self): + x = paddle.rand([]) + x.stop_gradient = False + + out = paddle.reshape(x, []) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + out = paddle.reshape(x, [1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1, 1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape_tensor(self): + x = paddle.rand([1, 1]) + x.stop_gradient = False + + out = paddle.reshape(x, []) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + new_shape = paddle.full([], 1, "int32") + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = paddle.full([], -1, "int32") + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + 
def test_reshape__list(self): + x = paddle.rand([]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + out = paddle.reshape_(x, [1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1, 1]) + self.assertEqual(out.shape, [1, 1]) + + def test_reshape__tensor(self): + x = paddle.rand([1, 1]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + new_shape = paddle.full([1], 1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = paddle.full([1], -1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1, 1]) + def test_sort(self): x1 = paddle.rand([]) x2 = paddle.rand([]) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 86cab526398dd4412227a1b99dd562bd4fcb1cbb..52a0f8b4b3c4f4790008ed3224a3696da1f41cda 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -1535,7 +1535,7 @@ class Model: assert isinstance( self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm), - ), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." + ), "Only ClipGradByNorm and ClipGradByGlobalNorm are supported in amp training with level=O2 currently." 
self._adapter._amp_custom_lists = {} self._adapter._amp_configs = {} diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index ca4922700b8f49f4b8a3a9222ce0afcdb9228b1f..6bee79b871cd5e721be31545c4037afa6a5668ea 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -15,13 +15,14 @@ import paddle import paddle.distributed as dist from paddle.fluid import core, layers -from paddle.fluid.clip import ClipGradBase, _squared_l2_norm from paddle.fluid.dygraph import base as imperative_base +from paddle.nn import clip +from paddle.nn.clip import ClipGradBase, _squared_l2_norm class ClipGradForMOEByGlobalNorm(ClipGradBase): r""" - The Algrithm is the same as paddle.fluid.clip.ClipGradByGlobalNorm + The Algorithm is the same as paddle.nn.ClipGradByGlobalNorm Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` .
@@ -113,8 +114,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + merge_grad = clip.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows(merge_grad) sum_square = _squared_l2_norm(merge_grad) if sum_square.dtype == core.VarDesc.VarType.FP16: sum_square_list_fp16.append(sum_square) diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index bc2837fa2fe58f8b2e5dcaddf59e806471823b29..9aa51cd8122e68114e610714672980ba132f9629 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -16,11 +16,11 @@ import os import paddle from paddle.fluid import core, framework, unique_name -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.executor import global_scope from paddle.fluid.framework import Variable, name_scope from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.optimizer import Optimizer +from paddle.nn import ClipGradByGlobalNorm def init_communicator(block, rank, ranks, ring_id): diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index 3ec3dba88df2bcbf479a74c785cfbcbe970b7a4e..328b879c5aab62905fac59e752281a4c05cefc44 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -42,10 +42,12 @@ def convert_attr(x, attr): def indexable(x, code=None): if isinstance(x, Variable): return x - if hasattr(x, '__len__') and hasattr(x, '__getitem__'): - return x - if hasattr(x, '__iter__'): + elif hasattr(x, '__iter__'): return [i for i in x] + elif hasattr(x, '__len__') and hasattr( + x, '__getitem__' + ): # used for customed type and non-iterable type. 
+ return x else: raise RuntimeError("X can't be convert into indexable.") diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 61143175fd4af5070ab72036de7c0cc47778aa43..10eeb6319063c1468b20bc2b03c0528e82b77bf6 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -12,9 +12,1074 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define the functions to clip gradient of parameter -from ..fluid.clip import ClipGradByGlobalNorm # noqa: F401 -from ..fluid.clip import ClipGradByNorm # noqa: F401 -from ..fluid.clip import ClipGradByValue # noqa: F401 +import copy +import warnings + +import paddle +import paddle.autograd as imperative_base +from paddle import _C_ops, _legacy_C_ops +from paddle.common_ops_import import Variable, check_type, default_main_program +from paddle.fluid import core, framework, layers, unique_name +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.framework import LayerHelper, _non_static_mode, in_dygraph_mode +from paddle.tensor.layer_function_generator import templatedoc __all__ = [] + + +@templatedoc() +def clip_by_norm(x, max_norm, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + max_norm(${max_norm_type}): ${max_norm_comment} + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: + + out(${out_type}): ${out_comment} + + + Examples: + .. 
code-block:: python + + import paddle + from paddle.nn import clip + + input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32') + reward = clip.clip_by_norm(x=input, max_norm=1.0) + # [[0.5, 0.5], [0.5, 0.5]] + """ + + if in_dygraph_mode(): + return _C_ops.clip_by_norm(x, max_norm) + if _non_static_mode(): + return _legacy_C_ops.clip_by_norm(x, 'max_norm', max_norm) + + helper = LayerHelper("clip_by_norm", **locals()) + check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm') + check_type(max_norm, 'max_norm', (float), 'clip_by_norm') + + if name is None: + name = unique_name.generate_with_ignorable_key( + ".".join([helper.name, 'tmp']) + ) + + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False + ) + + helper.append_op( + type="clip_by_norm", + inputs={"X": x}, + attrs={"max_norm": max_norm}, + outputs={"Out": out}, + ) + + return out + + +@templatedoc() +def merge_selected_rows(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + b = fluid.default_main_program().global_block() + var = b.create_var( + name="X", dtype="float32", persistable=True, + type=fluid.core.VarDesc.VarType.SELECTED_ROWS) + y = nn.merge_selected_rows(var) + """ + if in_dygraph_mode(): + return _C_ops.merge_selected_rows(x) + + if _non_static_mode(): + return _legacy_C_ops.merge_selected_rows(x) + + helper = LayerHelper("merge_selected_rows", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="merge_selected_rows", + inputs={"X": x}, + attrs={}, + outputs={"Out": out}, + ) + return out + + +@templatedoc() +def get_tensor_from_selected_rows(x, name=None): + """ + Get tensor data from input with SelectedRows type, and outputs a Tensor. + + .. 
code-block:: text + + input x is SelectedRows: + x.rows = [0, 5, 5, 4, 19] + x.height = 20 + x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]] + + Output is LoDTensor: + out.shape = [5, 2] + out.data = [[1, 1], + [2, 2], + [2, 2], + [3, 3], + [6, 6]] + + Args: + x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64. + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Variable: LoDTensor transformed from SelectedRows. The data type is same with input. + + Examples: + .. code-block:: python + + from paddle import nnp.py + b = fluid.default_main_program().global_block() + input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS) + out = nn.get_tensor_from_selected_rows(input) + """ + + check_type(x, 'x', Variable, 'get_tensor_from_selected_rows') + if x.type != core.VarDesc.VarType.SELECTED_ROWS: + raise TypeError( + "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS." 
+ ) + helper = LayerHelper('get_tensor_from_selected_rows', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='get_tensor_from_selected_rows', + inputs={'X': x}, + outputs={'Out': out}, + attrs={}, + ) + return out + + +_clip_by_global_norm_using_mp_type_flag = False + + +def _clip_by_global_norm_using_mp_type(*args): + global _clip_by_global_norm_using_mp_type_flag + assert len(args) <= 1 + if len(args) == 1: + assert isinstance(args[0], bool) + old_value = _clip_by_global_norm_using_mp_type_flag + _clip_by_global_norm_using_mp_type_flag = args[0] + return old_value + else: + return _clip_by_global_norm_using_mp_type_flag + + +def _cast_to_mp_type_if_enabled(x): + if ( + x.dtype == core.VarDesc.VarType.FP16 + or x.dtype == core.VarDesc.VarType.BF16 + ) and _clip_by_global_norm_using_mp_type(): + return x.astype(core.VarDesc.VarType.FP32) + else: + return x + + +def _squared_l2_norm(x): + r""" + Return the squared L2 norm of a tensor. + """ + + x = _cast_to_mp_type_if_enabled(x) + if ( + core.is_compiled_with_xpu() + or x.dtype == core.VarDesc.VarType.FP16 + or x.dtype == core.VarDesc.VarType.BF16 + ): + square = paddle.square(x) + sum_square = paddle.sum(square) + return sum_square + + if in_dygraph_mode(): + return _C_ops.squared_l2_norm(x) + + op_type = 'squared_l2_norm' + check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type) + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + + inputs = {"X": x} + outputs = {'Out': out} + helper.append_op(type=op_type, inputs=inputs, outputs=outputs) + return out + + +class BaseErrorClipAttr: + def __str__(self): + raise NotImplementedError() + + def _append_clip_op(self, block, grad_name): + raise NotImplementedError() + + +class ErrorClipByValue(BaseErrorClipAttr): + r""" + Clip tensor values to the range [min, max]. 
+ + Given a tensor ``t`` (see Examples below), this operation clips its value \ + to ``min`` and ``max`` inplace. + + - Any values less than min are set to min. + - Any values greater than max are set to max. + + Args: + max (float): The maximum value to clip by. + min (float, optional): The minimum value to clip by. if not set by user, \ + will be set to ``-max`` by framework. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + paddle.enable_static() + BATCH_SIZE = 128 + CLIP_MAX = 2e-6 + CLIP_MIN = -1e-6 + prog = fluid.framework.Program() + with fluid.program_guard(main_program=prog): + image = fluid.layers.data( + name='x', shape=[784], dtype='float32') + hidden1 = fluid.layers.fc(input=image, size=128, act='relu') + hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') + predict = fluid.layers.fc( + input=hidden2, size=10, act='softmax') + label = fluid.layers.data(name='y', shape=[1], dtype='int64') + cost = paddle.nn.functional.cross_entropy(input=predict, label=label) + avg_cost = paddle.mean(cost) + prog_clip = prog.clone() + prog_clip.block(0).var(hidden1.name)._set_error_clip( + paddle.nn.clip.ErrorClipByValue( + max=CLIP_MAX, min=CLIP_MIN) + ) + """ + + def __init__(self, max, min=None): + max = float(max) + if min is None: + min = -max + else: + min = float(min) + self.max = max + self.min = min + + def __str__(self): + return "ByValue, min=%f, max=%f" % (self.min, self.max) + + def _append_clip_op(self, block, grad_name): + clip_op_desc = block.desc.append_op() + clip_op_desc.set_type("clip") + clip_op_desc.set_input("X", [grad_name]) + clip_op_desc.set_output("Out", [grad_name]) + clip_op_desc._set_attr("min", self.min) + clip_op_desc._set_attr("max", self.max) + + +def error_clip_callback(block, context): + # the context is a grad_to_var map + grad_to_var = context + op_desc = block.desc.op(block.desc.op_size() - 1) + for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]: + fwd_var = 
block._var_recursive(grad_to_var[grad_n]) + error_clip = getattr(fwd_var, "error_clip", None) + if not ( + error_clip is None or isinstance(error_clip, BaseErrorClipAttr) + ): + raise TypeError( + "Variable's error_clip should be an instance of BaseErrorClipAttr or None." + ) + if error_clip is not None: + error_clip._append_clip_op(block, grad_n) + + +class ClipGradBase: + def __init__(self): + super().__init__() + + def __str__(self): + raise NotImplementedError() + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + raise NotImplementedError + + def _static_clip(self, params_grads): + raise NotImplementedError + + def __call__(self, params_grads): + if _non_static_mode(): + return self._dygraph_clip(params_grads) + else: + for p, g in params_grads: + if getattr(p, 'gradient_clip_attr', None) is not None: + warnings.warn( + "'set_gradient_clip' will be ineffective, because you have " + "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' " + "is redundant and you can remove it." + ) + break + return self._static_clip(params_grads) + + def _process_context(self, context, param, grad): + raise NotImplementedError() + + def _create_operators(self, param, grad): + raise NotImplementedError() + + +class ClipGradByValue(ClipGradBase): + """ + Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. + + - Any values less than min are set to ``min``. + + - Any values greater than max are set to ``max``. + + The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + (for example: :ref:`api_paddle_optimizer_SGD`). + + Note: + ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. 
+ Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope. + + Args: + max (float): The maximum value to clip by. + min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` + automatically. In this case, ``max`` must be greater than 0. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + clip = paddle.nn.ClipGradByValue(min=-1, max=1) + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) + sdg.step() + """ + + def __init__(self, max, min=None): + super().__init__() + if min is None: + assert max > 0.0 + min = -max + self.max = float(max) + self.min = float(min) + + def __str__(self): + return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max) + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + params_and_grads = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + new_grad = paddle.clip(x=g, min=self.min, max=self.max) + params_and_grads.append((p, new_grad)) + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + param_new_grad_name_dict = dict() + with framework.name_scope('gradient_clip'): + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_grad = paddle.clip(x=g, min=self.min, max=self.max) + params_and_grads.append((p, new_grad)) + param_new_grad_name_dict[p.name] = new_grad.name + _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) + return
params_and_grads + + def _process_context(self, context, param, grad): + pass + + def _create_operators(self, param, grad): + new_grad = paddle.clip(x=grad, min=self.min, max=self.max) + return param, new_grad + + +class ClipGradByNorm(ClipGradBase): + r""" + Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . + + - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio. + + - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. + + The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + (for example: :ref:`api_paddle_optimizer_SGD`). + + The clipping formula is: + + .. math:: + Out = + \left\{ + \begin{array}{ccl} + X & & if (norm(X) \leq clip\_norm) \\ + \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\ + \end{array} + \right. + + + where :math:`norm(X)` represents the L2 norm of :math:`X`. + + .. math:: + norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} + + Note: + ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope. + + Args: + clip_norm(float): The maximum norm value. + + Examples: + ..
code-block:: python + + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) + sdg.step() + """ + + def __init__(self, clip_norm): + super().__init__() + self.clip_norm = float(clip_norm) + + def __str__(self): + return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + params_and_grads = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + new_grad = clip_by_norm(x=g, max_norm=self.clip_norm) + params_and_grads.append((p, new_grad)) + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + with framework.name_scope('gradient_clip'): + param_new_grad_name_dict = dict() + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_grad = clip_by_norm(x=g, max_norm=self.clip_norm) + param_new_grad_name_dict[p.name] = new_grad.name + params_and_grads.append((p, new_grad)) + _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) + return params_and_grads + + def _process_context(self, context, param, grad): + pass + + def _create_operators(self, param, grad): + new_grad = clip_by_norm(x=grad, max_norm=self.clip_norm) + return param, new_grad + + +_allow_pure_fp16_global_norm_clip_flag = False + + +def _allow_pure_fp16_global_norm_clip(*args): + global _allow_pure_fp16_global_norm_clip_flag + if len(args) == 0: + return 
_allow_pure_fp16_global_norm_clip_flag + else: + assert len(args) == 1 and isinstance(args[0], bool) + old_value = _allow_pure_fp16_global_norm_clip_flag + _allow_pure_fp16_global_norm_clip_flag = args[0] + return old_value + + +class ClipGradByGlobalNorm(ClipGradBase): + r""" + Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in + :math:`t\_list` , and limit it to ``clip_norm`` . + + - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio. + + - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. + + The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + (for example: :ref:`api_paddle_optimizer_SGD`). + + The clipping formula is: + + .. math:: + + t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)} + + where: + + .. math:: + + global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} + + Note: + ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope. + + Args: + clip_norm (float): The maximum norm value. + group_name (str, optional): The group name for this clip. Default value is ``default_group``. + auto_skip_clip (bool, optional): skip clipping gradient. Default value is ``False``. + + Examples: + ..
code-block:: python + + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) + sdg.step() + """ + + def __init__( + self, clip_norm, group_name="default_group", auto_skip_clip=False + ): + super().__init__() + self.clip_norm = float(clip_norm) + self.group_name = group_name + assert isinstance(auto_skip_clip, bool) + self.auto_skip_clip = auto_skip_clip + + def __str__(self): + return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + sum_square_list_fp16 = [] + sum_square_list_fp32 = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + + if in_dygraph_mode() and g.is_selected_rows(): + merge_grad = merge_selected_rows(g) + merge_grad = merge_grad._get_tensor_from_selected_rows() + + elif g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(g) + merge_grad = get_tensor_from_selected_rows(merge_grad) + + sum_square = _squared_l2_norm(merge_grad) + if ( + sum_square.dtype == core.VarDesc.VarType.FP16 + or sum_square.dtype == core.VarDesc.VarType.BF16 + ): + sum_square_list_fp16.append(sum_square) + elif sum_square.dtype == core.VarDesc.VarType.FP32: + sum_square_list_fp32.append(sum_square) + else: + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if ( + len(sum_square_list) + + len(sum_square_list_fp16) + + len(sum_square_list_fp32) + == 0 + ): + return params_grads + + sum_dtype = 'float64' if 
len(sum_square_list) > 0 else "float32" + global_norm_var = [] + if len(sum_square_list_fp16) > 0: + global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16) + global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) + if len(sum_square_list_fp32) > 0: + global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32) + if sum_dtype == 'float32': + global_norm_var.append(global_norm_var_fp32) + else: + global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) + if len(sum_square_list) > 0: + global_norm_var_fp64 = paddle.add_n(sum_square_list) + global_norm_var.append(global_norm_var_fp64) + global_norm_var = paddle.add_n(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + max_global_norm = paddle.full( + shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm + ) + + need_clip = False + if not self.auto_skip_clip: # always apply clip + need_clip = True + clip_var = paddle.divide( + x=max_global_norm, + y=paddle.maximum(x=global_norm_var, y=max_global_norm), + ) + elif global_norm_var > max_global_norm: + # only when global_norm_var > max_global_norm, grad need clip + need_clip = True + clip_var = paddle.divide(x=max_global_norm, y=global_norm_var) + + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + # TODO(wangxi): use inplace elementwise_mul + if need_clip: + clip_input = ( + clip_var.astype(g.dtype) + if clip_var.dtype != g.dtype + else clip_var + ) + new_grad = paddle.multiply(g, clip_input) + params_and_grads.append((p, new_grad)) + else: + params_and_grads.append((p, g)) + + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + sum_square_list_fp16 = [] + sum_square_list_fp32 = [] + with framework.name_scope('gradient_clip'): + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + with 
p.block.program._optimized_guard([p, g]): + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(g) + merge_grad = get_tensor_from_selected_rows(merge_grad) + sum_square = _squared_l2_norm(merge_grad) + if sum_square.dtype == core.VarDesc.VarType.FP16: + sum_square_list_fp16.append(sum_square) + elif sum_square.dtype == core.VarDesc.VarType.FP32: + sum_square_list_fp32.append(sum_square) + else: + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if ( + len(sum_square_list) + + len(sum_square_list_fp16) + + len(sum_square_list_fp32) + == 0 + ): + return params_grads + + with p.block.program._optimized_guard([p, g]): + sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" + + global_norm_var = [] + if len(sum_square_list_fp16) > 0: + global_norm_var_fp16 = layers.sums(sum_square_list_fp16) + if ( + sum_square_list_fp32 + or sum_square_list + or not _allow_pure_fp16_global_norm_clip() + ): + global_norm_var.append( + global_norm_var_fp16.astype(sum_dtype) + ) + else: + global_norm_var.append(global_norm_var_fp16) + if len(sum_square_list_fp32) > 0: + global_norm_var_fp32 = layers.sums(sum_square_list_fp32) + if sum_dtype == 'float32': + global_norm_var.append(global_norm_var_fp32) + else: + global_norm_var.append( + global_norm_var_fp32.astype(sum_dtype) + ) + if len(sum_square_list) > 0: + # fp64 + global_norm_var_other_dtype = layers.sums(sum_square_list) + global_norm_var.append(global_norm_var_other_dtype) + + global_norm_var = ( + layers.sums(global_norm_var) + if len(global_norm_var) > 1 + else global_norm_var[0] + ) + global_norm_var = paddle.sqrt(x=global_norm_var) + max_global_norm = paddle.full( + shape=[1], + dtype=global_norm_var.dtype, + fill_value=self.clip_norm, + ) + scale_var = paddle.divide( + x=max_global_norm, + y=paddle.maximum(x=max_global_norm, y=global_norm_var), + ) + param_new_grad_name_dict = dict() + for p, g in params_grads: + if g is None: + continue + if getattr(p, 
'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_g = _cast_to_mp_type_if_enabled(g) + # inplace + scale_input = ( + scale_var.astype('float16') + if new_g.dtype == core.VarDesc.VarType.FP16 + and scale_var.dtype != core.VarDesc.VarType.FP16 + else scale_var + ) + # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g + # will be in different blocks with the gradient clip related ops. + # We need to handle the correct block, otherwise will encounter + # a 'NotFoundError' during compile time. + block = default_main_program().current_block() + block.append_op( + type='elementwise_mul', + inputs={'X': new_g, 'Y': scale_input}, + outputs={'Out': new_g}, + ) + if new_g is not g: + block.append_op( + type='cast', + inputs={'X': new_g}, + outputs={'Out': g}, + attrs={ + 'in_dtype': new_g.dtype, + 'out_dtype': g.dtype, + }, + ) + + param_new_grad_name_dict[p.name] = g.name + params_and_grads.append((p, g)) + + _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) + return params_and_grads + + def _process_context(self, context, param, grad): + if self.group_name not in context: + context[self.group_name] = [] + context[self.group_name + "_clip_value"] = self.clip_norm + context[self.group_name + "_clip"] = paddle.full( + shape=[1], dtype=grad.dtype, fill_value=self.clip_norm + ) + else: + if not self.clip_norm == context[self.group_name + "_clip_value"]: + raise ValueError( + "All parameters' 'clip_norm' of a same group should be the same" + ) + + merge_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(grad) + merge_grad = get_tensor_from_selected_rows(merge_grad) + + local_norm_var = _squared_l2_norm(merge_grad) + context[self.group_name].append(local_norm_var) + + self.context = context + + def _create_operators(self, param, grad): + group_scale_name = self.group_name + "_scale" + if group_scale_name not in 
self.context: + group_norm_var = layers.sums(input=self.context[self.group_name]) + group_norm_var = paddle.sqrt(x=group_norm_var) + clip_var = self.context[self.group_name + "_clip"] + group_scale_var = paddle.divide( + x=clip_var, + y=paddle.maximum(x=clip_var, y=group_norm_var), + ) + assert group_scale_var.shape == (1,) + self.context[group_scale_name] = group_scale_var + + # inplace + param.block.append_op( + type='elementwise_mul', + inputs={'X': grad, 'Y': self.context[group_scale_name]}, + outputs={'Out': grad}, + ) + + return param, grad + + +@framework.dygraph_not_support +def set_gradient_clip(clip, param_list=None, program=None): + """ + Warning: + + This API must be used after building network, and before ``minimize`` , + and it may be removed in future releases, so it is not recommended. + It is recommended to set ``grad_clip`` when initializing the ``optimizer`` , + this is a better method to clip gradient. There are three clipping strategies: + :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` . + + To specify parameters that require gradient clip. + + Args: + clip (GradientClipBase): Gradient clipping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three clipping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). A ``TypeError`` is raised if ``clip`` is + not such an instance. + param_list (list(Variable), optional): Parameters that require gradient clip. + It can be a list of parameters or a list of parameter names. + Default None, meaning that all parameters in the program will be included. + program (Program, optional): The program where parameters are located. + Default None, meaning that :ref:`api_fluid_default_main_program` is used. + + Returns: + None + + Examples: + ..
code-block:: python + + import paddle + import paddle.fluid as fluid + + paddle.enable_static() + + def network(): + image = fluid.data(name='image', shape=[ + None, 28], dtype='float32') + param_attr1 = fluid.ParamAttr("fc1_param") + fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1) + param_attr2 = fluid.ParamAttr("fc2_param") + fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2) + loss = paddle.mean(fc2) + return loss + + + # network 1: clip all parameter gradient + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0)) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 2: clip parameter gradient by name + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByValue(min=-1.0, max=1.0), + param_list=["fc1_param", "fc2_param"]) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 3: clip parameter gradient by value + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + param_var1 = fluid.default_main_program().global_block().var("fc1_param") + param_var2 = fluid.default_main_program().global_block().var("fc2_param") + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByValue(min=-1.0, max=1.0), + param_list=[param_var1, param_var2]) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0) + clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0) + # Set the gradient clipping strategy: clip1 + paddle.nn.clip.set_gradient_clip(clip1) + # Set the gradient clipping strategy: clip2 + sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2) + 
sgd.minimize(loss) + # 'set_gradient_clip' will not take effect when setting has a conflict, + # and the gradient clipping strategy will be 'clip2' + + + """ + warnings.warn( + "Caution! 'set_gradient_clip' is not recommended " + "and may be deprecated in future! " + "We recommend a new strategy: set 'grad_clip' " + "when initializing the 'optimizer'. " + "This method can reduce the mistakes, please " + "refer to documention of 'optimizer'." + ) + + if not isinstance(clip, ClipGradBase): + raise TypeError( + "'clip' should be an instance of ClipGradBase's derived class" + ) + if program is None: + program = framework.default_main_program() + + for op in program.block(0).ops: + if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr( + "op_namescope" + ): + warnings.warn( + "'minimize' has been invoked before, this will make 'set_gradient_clip' " + "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'." + ) + break + + if param_list is None: + param_list = program.block(0).all_parameters() + if all(isinstance(elem, str) for elem in param_list): + param_list = [program.block(0).var(elem) for elem in param_list] + if not all(isinstance(elem, framework.Parameter) for elem in param_list): + raise TypeError( + "'param_list' should be a list of Parameter or basestring(parameter's name)." 
+ ) + + for param in param_list: + param.gradient_clip_attr = copy.deepcopy(clip) + + +def append_gradient_clip_ops(param_grads): + context = dict() + for p, g in param_grads: + if g is None: + continue + with p.block.program._optimized_guard([p, g]), framework.name_scope( + 'gradient_clip' + ): + clip_attr = getattr(p, 'gradient_clip_attr', None) + if clip_attr is None: + return param_grads + if not isinstance(clip_attr, ClipGradBase): + raise TypeError( + "clip attribute should be an instance of GradientClipBase" + ) + + clip_attr._process_context(context=context, param=p, grad=g) + + res = [] + param_new_grad_name_dict = dict() + for p, g in param_grads: + if g is None: + continue + with p.block.program._optimized_guard([p, g]), framework.name_scope( + 'gradient_clip' + ): + param, new_grad = clip_attr._create_operators(param=p, grad=g) + param_new_grad_name_dict[param.name] = new_grad.name + res.append([param, new_grad]) + + _correct_clip_op_role_var(res, param_new_grad_name_dict) + return res + + +# change wrong mapping relation between param & grad in clip op +# Note: This function is sensitive to the time cost of the network with gradient clipping +# and should not be changed easily. If you must change, please test the time cost. 
+def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): + block_id_list = [] + if len(param_new_grad_name_dict) == 0: + return + for param, grad in params_grads: + if grad is None: + continue + block_id = param.block.idx + if block_id in block_id_list: + continue + block_id_list.append(block_id) + for op in param.block.program.global_block().ops: + if ( + op.has_attr("op_namescope") + and "gradient_clip" in op.attr("op_namescope") + and op.attr('op_role_var') + ): + param_name = op.attr('op_role_var')[0] + if param_name in param_new_grad_name_dict: + correct_p_g = [ + param_name, + param_new_grad_name_dict[param_name], + ] + op._set_attr('op_role_var', correct_p_g) + + +GradientClipBase = ClipGradBase +GradientClipByValue = ClipGradByValue +GradientClipByNorm = ClipGradByNorm +GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index ceaa6e5e4a8dbc1514177390af76fc1a5ba213f0..74a97e25938ed300620dcb997205985176f74ca8 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -984,6 +984,13 @@ def conv1d_transpose( ) ) + if len(weight.shape) != 3: + raise ValueError( + 'Input weight should be 3D tensor, but received weight with the shape of {}'.format( + weight.shape + ) + ) + op_type = 'conv2d_transpose' num_filters = weight.shape[1] if ( diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f03b3af2df97084e1bc2e5bd9d67b1442a19d3ee..a4d304b451e7b3cad3fdab97bf05e7854146a260 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -20,10 +20,10 @@ import paddle from .. 
import _C_ops from ..fluid import core, framework, unique_name -from ..fluid.clip import GradientClipBase from ..fluid.dygraph import base as imperative_base from ..fluid.framework import Parameter, Variable from ..fluid.layer_helper import LayerHelper +from ..nn.clip import GradientClipBase from .lr import LRScheduler from .optimizer import Optimizer diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index d5f18130a4c63e0883638773cf015872d2b22288..1799461254ced546eb35ac119d0cf893169c854e 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -18,6 +18,7 @@ from collections import defaultdict import numpy as np import paddle +import paddle.autograd as imperative_base from paddle import _C_ops from paddle.fluid import core from paddle.fluid.framework import ( @@ -32,12 +33,6 @@ from paddle.fluid.framework import ( from ..fluid import framework, unique_name from ..fluid.backward import _get_no_grad_set_name, append_backward -from ..fluid.clip import ( - GradientClipBase, - append_gradient_clip_ops, - error_clip_callback, -) -from ..fluid.dygraph import base as imperative_base from ..fluid.framework import Parameter, program_guard from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper @@ -168,7 +163,7 @@ class Optimizer: """ - @imperative_base.no_grad + @imperative_base.no_grad() def __init__( self, learning_rate, @@ -225,7 +220,7 @@ class Optimizer: % type(learning_rate) ) if grad_clip is not None: - if not isinstance(grad_clip, GradientClipBase): + if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase): raise TypeError( "'grad_clip' should be an instance of GradientClipBase's derived class" ) @@ -1042,7 +1037,7 @@ class Optimizer: params_grads.append((parameter_list[index], grad)) else: if callbacks is None: - callbacks = [error_clip_callback] + callbacks = [paddle.nn.clip.error_clip_callback] else: assert isinstance(callbacks, list) program = 
loss.block.program @@ -1103,7 +1098,7 @@ class Optimizer: params_grads = self._grad_clip(params_grads) else: - params_grads = append_gradient_clip_ops(params_grads) + params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads) # Add regularization if any params_grads = self.append_regularization_ops( @@ -1317,7 +1312,7 @@ class Optimizer: else: core.clear_gradients(param_list, set_to_zero) - @imperative_base.no_grad + @imperative_base.no_grad() def minimize( self, loss, startup_program=None, parameters=None, no_grad_set=None ): @@ -1380,7 +1375,7 @@ class Optimizer: return optimize_ops, params_grads - @imperative_base.no_grad + @imperative_base.no_grad() @framework.dygraph_only def step(self): """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 9b6d0fecf617247de4cbb2237db85923c23f1b8f..842deaac991a9c0ef006d21175e86e6c7b5767a4 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -3450,7 +3450,7 @@ def reshape(x, shape, name=None): Args: x (Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32``, ``int64`` or ``bool`` shape (list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1. - The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. + The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape []. If ``shape`` is an Tensor, it should be an 1-D Tensor . name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -3574,10 +3574,6 @@ def reshape(x, shape, name=None): shape.stop_gradient = True inputs["Shape"] = shape elif isinstance(shape, (list, tuple)): - assert len(shape) > 0, ( - "The size of 'shape' in reshape can't be zero, " - "but received %s." 
% len(shape) - ) attrs["shape"] = get_attr_shape(shape) if utils._contain_var(shape): inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape) diff --git a/python/paddle/utils/flops.py b/python/paddle/utils/flops.py index a930e0ef5488d8a492b76a69cd02c633c695f0ed..71f54ee29cbe9f29ef35a3ee836a875feb83d73e 100644 --- a/python/paddle/utils/flops.py +++ b/python/paddle/utils/flops.py @@ -73,7 +73,7 @@ def _c_embedding_flops(input_shapes, attrs): def _dropout_flops(input_shapes, attrs): """FLOPs computation for dropout op. For dropout(input): - equation: flops = 0 + equation: flops = 0 """ return 0 @@ -191,7 +191,7 @@ def _matmul_v2_flops(input_shapes, attrs): """FLOPs computation for matmul_v2 op. For matmul_v2(input,other): input_shapes = [shape_of_input, shape_of_ohther] - shape_of_input = [dim1, dim2 ...dim_n_1, dim_n] length:n + shape_of_input = [dim1, dim2 ...dim_n_1, dim_n] length:n shape_of_other = [odim1, odim2 ... odim(n-m) ... odim_m_1, dim_m] length:m suppose n > m and dim_n = odim_m_1: shape_of_output = [dim1, dim2 ... max(dim(n-m), odim(n-m)), max(dim(n-m+1), odim(n-m+1))...dim_n_1, dim_m] @@ -216,13 +216,43 @@ def _matmul_v2_flops(input_shapes, attrs): return 2 * macs -@register_flops("relu") -def _relu_flops(input_shapes, attrs): - """FLOPs computation for relu op. - For relu(input): +def _relu_class_flops(input_shapes, attrs): + """FLOPs computation for relu_like ops. + For elu/leaky_relu/prelu/relu/relu6/silu (input): equation: flops = (numel)total number of elements in the input tensor. 
""" - return prod(input_shapes.get('X')[0]) + input = input_shapes.get('X')[0] + return prod(input) + + +@register_flops("elu") +def _elu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("leaky_relu") +def _leaky_relu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("prelu") +def _prelu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("relu") +def _relu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("relu6") +def _relu6_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("silu") +def _silu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) @register_flops("reshape2") diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py index 8cdf32f348f090b14b7d33284b400f708f611d1c..b3b1df8afe29a5ceecf57ee5d43d3cdba522cc96 100644 --- a/tools/get_single_test_cov.py +++ b/tools/get_single_test_cov.py @@ -12,12 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os import re +import subprocess import sys def getFNDAFile(rootPath, test): + # load base fnda + fnda_base_dict = {} + find_file_cmd = os.popen("find %s -name %s.cc" % (rootPath, test)) + if find_file_cmd.read() != "": + print("%s is a c++ unittest" % test) + with open( + "%s/build/ut_map/simple_precision_test/base_fnda.json" % rootPath, + 'r', + ) as load_f: + fnda_base_dict = json.load(load_f) + # analyse fnda filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test) fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test) os.system('touch %s' % fn_filename) @@ -27,15 +40,28 @@ def getFNDAFile(rootPath, test): except FileNotFoundError: print("%s is not found." 
% filename) return - lines = f.readlines() - for line in lines: - line = line.replace('\n', '') - if line.startswith(('SF:')): - os.system('echo %s >> %s' % (line, fn_filename)) - elif line.startswith(('FNDA:')): - hit = int(line.split('FNDA:')[1].split(',')[0]) - if hit != 0: - os.system('echo %s >> %s' % (line, fn_filename)) + all_data = f.read().split('TN:') + del all_data[0] + for gcov_data in all_data: + message_list = gcov_data.split('\n') + os.system('echo %s >> %s' % (message_list[1], fn_filename)) + if 'FNH:0' not in gcov_data: + for message in message_list: + if message.startswith(('FNDA:')) and ( + not message.startswith(('FNDA:0,')) + ): + tmp_data = message.split('FNDA:')[1].split(',') + hit = int(tmp_data[0]) + symbol = tmp_data[1] + if symbol in fnda_base_dict: + if (hit - fnda_base_dict[symbol]) > 0: + fnda_str = 'FNDA:%s,%s' % ( + str(hit - fnda_base_dict[symbol]), + symbol, + ) + os.system('echo %s >> %s' % (fnda_str, fn_filename)) + else: + os.system('echo %s >> %s' % (message, fn_filename)) f.close() @@ -112,10 +138,55 @@ def analysisFNDAFile(rootPath, test): f.close() +def getBaseFnda(rootPath, test): + filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test) + try: + f = open(filename) + print("oepn %s succesfully" % filename) + except FileNotFoundError: + print("%s is not found." 
% filename) + symbol_fnda = {} + all_data = f.read().split('TN:') + del all_data[0] + for gcov_data in all_data: + message_list = gcov_data.split('\n') + # only for cc file + if ".cc" in message_list[1]: + for message in message_list: + if message.startswith(('FNDA:')) and ( + not message.startswith(('FNDA:0,')) + ): + tmp_data = message.split('FNDA:')[1].split(',') + symbol_fnda[tmp_data[1]] = int(tmp_data[0]) + f.close() + + with open("%s/build/ut_map/%s/base_fnda.json" % (rootPath, test), "w") as f: + json.dump(symbol_fnda, f, indent=4) + + def getCovinfo(rootPath, test): ut_map_path = '%s/build/ut_map/%s' % (rootPath, test) + print("start get fluid ===>") + cmd_fluid = 'lcov --capture -d ./paddle/fluid/ -o ./paddle/fluid/coverage_fluid.info --rc lcov_branch_coverage=0' + p_fluid = subprocess.Popen(cmd_fluid, shell=True, stdout=subprocess.DEVNULL) + + print("start get phi ===>") + cmd_phi = 'lcov --capture -d ./paddle/phi -o ./paddle/phi/coverage_phi.info --rc lcov_branch_coverage=0' + p_phi = subprocess.Popen(cmd_phi, shell=True, stdout=subprocess.DEVNULL) + + print("start get utils ===>") + cmd_utils = 'lcov --capture -d ./paddle/utils -o ./paddle/utils/coverage_utils.info --rc lcov_branch_coverage=0' + p_utils = subprocess.Popen(cmd_utils, shell=True, stdout=subprocess.DEVNULL) + + print("start wiat fluid ===>") + p_fluid.wait() + print("start wiat phi ===>") + p_phi.wait() + print("start wiat utils ===>") + p_utils.wait() + print("end wait...") os.system( - 'cd %s && lcov --capture -d . 
-o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1' + 'cd %s && lcov -a paddle/fluid/coverage_fluid.info -a paddle/phi/coverage_phi.info -a paddle/utils/coverage_utils.info -o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1' % ut_map_path ) coverage_info_path = ut_map_path + '/coverage.info' @@ -139,8 +210,11 @@ def getCovinfo(rootPath, test): os.system('rm -rf %s/paddle' % ut_map_path) os.system('rm -rf %s/coverage.info' % ut_map_path) - getFNDAFile(rootPath, test) - analysisFNDAFile(rootPath, test) + if test == "simple_precision_test": + getBaseFnda(rootPath, test) + else: + getFNDAFile(rootPath, test) + analysisFNDAFile(rootPath, test) os.system('rm -rf %s/coverage.info.tmp' % ut_map_path) diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py index 1a096fa894e4630749d8773447b18226a5e607df..a33c1cd66811919b5276446af858f68c39d46ffa 100644 --- a/tools/handle_h_cu_file.py +++ b/tools/handle_h_cu_file.py @@ -72,6 +72,31 @@ def insert_pile_to_h_file(rootPath): os.system('echo "\n#endif" >> %s' % line) +def add_simple_cxx_test(rootPath): + variant_test_path = '%s/paddle/utils/variant_test.cc' % rootPath + variant_test_cmakeflie_path = '%s/paddle/utils/CMakeLists.txt' % rootPath + if os.path.exists(variant_test_path) and os.path.exists( + variant_test_cmakeflie_path + ): + simple_test_path = '%s/paddle/utils/simple_precision_test.cc' % rootPath + os.system('touch %s' % simple_test_path) + os.system( + "echo '#include \"gtest/gtest.h\"\n' >> %s" % simple_test_path + ) + os.system( + 'echo "TEST(interface_test, type) { }\n" >> %s' % simple_test_path + ) + os.system('echo "cc_test(" >> %s' % variant_test_cmakeflie_path) + os.system( + 'echo " simple_precision_test" >> %s' % variant_test_cmakeflie_path + ) + os.system( + 'echo " SRCS simple_precision_test.cc" >> %s' + % variant_test_cmakeflie_path + ) + os.system('echo " DEPS gtest)\n" >> %s' % variant_test_cmakeflie_path) + + def remove_pile_from_h_file(rootPath): h_cu_files = 
'%s/tools/h_cu_files.log' % rootPath f = open(h_cu_files) @@ -130,6 +155,7 @@ if __name__ == "__main__": elif func == 'insert_pile_to_h_file': rootPath = sys.argv[2] insert_pile_to_h_file(rootPath) + add_simple_cxx_test(rootPath) elif func == 'analy_h_cu_file': dir_path = sys.argv[2] rootPath = sys.argv[3] diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh index 011ac564cf91ba41b4c851ce7c52187658c359b8..3c6f0e140f250adab7e4f5c6ca1c1ba714473657 100755 --- a/tools/nvcc_lazy.sh +++ b/tools/nvcc_lazy.sh @@ -65,12 +65,14 @@ echo "sed -i -e '/LIBRARIES=/{s/\s//g;s/\"\"/ /g}' \${BUILDSH}.pre" >> $1 echo -e >> $1 echo "/usr/bin/env bash \${BUILDSH}.pre" >> $1 echo "STUBF=\$(find \$BUILDDIR -name *.cudafe1.stub.c)" >> $1 -echo "CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)" >> $1 -echo "sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF" >> $1 -echo "sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF" >> $1 -echo "# sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\\\"===> \${CUFILE} lazy-load? %d\\\\\\\\n\\\", l); __do____cudaRegisterAll();}\" \$STUBF" >> $1 -echo "sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF" >> $1 -echo "sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF" >> $1 -echo "sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF" >> $1 +echo "if [ ! 
-z \"\$STUBF\" ]; then" >> $1 +echo " CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)" >> $1 +echo " sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF" >> $1 +echo " sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF" >> $1 +echo " # sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\\\"===> \${CUFILE} lazy-load? %d\\\\\\\\n\\\", l); __do____cudaRegisterAll();}\" \$STUBF" >> $1 +echo " sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF" >> $1 +echo " sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF" >> $1 +echo " sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF" >> $1 +echo "fi" >> $1 echo "/usr/bin/env bash \${BUILDSH}.post" >> $1 echo "rm -rf \$BUILDDIR" >> $1