From 3af477112281a06bd6824945ca9e7c78015cbd66 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Fri, 20 Mar 2020 13:28:56 +0800
Subject: [PATCH] Add the detection and code-generation of sqrt and square in
 fusion_group (#23095)

---
 .../ir/fusion_group/code_generator.cc         | 25 +++---
 .../ir/fusion_group/code_generator_helper.cc  |  8 +-
 .../ir/fusion_group/cuda_resources.h          | 18 ++--
 .../elementwise_group_detector.cc             | 69 +++++++---------
 .../fusion_group/elementwise_group_detector.h |  2 +-
 .../ir/fusion_group/fusion_group_pass.cc      |  4 +-
 .../framework/ir/fusion_group/operation.cc    | 31 +++++--
 .../fluid/tests/unittests/ir/pass_test.py     |  7 +-
 .../unittests/ir/test_ir_fusion_group_pass.py | 82 +++++++++----------
 9 files changed, 128 insertions(+), 118 deletions(-)

diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index af75f933925..133f1dfce7c 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -25,15 +25,21 @@ namespace ir {
 namespace fusion_group {
 
 std::string ExtractDataType(const std::vector<Node*>& nodes) {
-  std::string dtype_str = "float";
-  auto data_type = nodes.back()->Var()->GetDataType();
-
-  if (data_type == proto::VarType::FP32) {
-    dtype_str = "float";
-  } else if (data_type == proto::VarType::FP64) {
-    dtype_str = "double";
-  } else if (data_type == proto::VarType::FP16) {
-    dtype_str = "float16";
+  std::string dtype_str = "";
+  for (const auto* n : nodes) {
+    if (n && n->IsVar() && n->Var()) {
+      // The data type of all inputs/outputs must be the same, which are
+      // checked when detecting the subgraph.
+      auto dtype = n->Var()->GetDataType();
+      if (dtype == proto::VarType::FP32) {
+        dtype_str = "float";
+      } else if (dtype == proto::VarType::FP64) {
+        dtype_str = "double";
+      } else if (dtype == proto::VarType::FP16) {
+        dtype_str = "float16";
+      }
+      break;
+    }
   }
 
   return dtype_str;
@@ -80,7 +86,6 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
   for (auto& name : input_names) {
     // Some input vars are not used in grad ops, such as
     // "elementwise_add_grad", where "X", "Y" and "Out" are not used.
-
     if ((HasInput(node, name) && op->Input(name).size() >= 1U)) {
       for (size_t i = 0; i < op->Input(name).size(); i++) {
         PADDLE_ENFORCE_NE(
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
index e9ed38cac4d..573f4c4de3e 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
@@ -38,13 +38,13 @@ static std::string ExpandMultivariateTemplate(const std::string rhs,
   int start_pos = rhs.find("[", 0);
   int end_pos = rhs.find("]", 0);
   std::string sum_rhs = rhs.substr(0, start_pos);
-  std::string sum_rhs_component =
+  std::string repeated_component =
       rhs.substr(start_pos + 1, (end_pos - start_pos - 1));
-  int replace_pos = sum_rhs_component.find("?", 0);
+  int replace_pos = repeated_component.find("?", 0);
   for (size_t i = 1; i < input_size; i++) {
-    std::string append_str =
-        sum_rhs_component.replace(replace_pos, 1, std::to_string(i));
+    std::string append_str = repeated_component;
+    append_str.replace(replace_pos, 1, std::to_string(i));
     sum_rhs = sum_rhs + append_str;
   }
   return sum_rhs;
 }
diff --git a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
index e4382e205ba..fa0bd96ccda 100644
--- a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
+++ b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
@@ -20,20 +20,26 @@ namespace ir {
 namespace fusion_group {
 
 static constexpr char predefined_cuda_functions_fp32[] = R"(
-__device__ inline float real_exp(float x) { return ::expf(x); }
-__device__ inline float real_log(float x) { return ::logf(x); }
+__device__ inline float Max(float x, float y) { return fmaxf(x, y); }
+__device__ inline float Exp(float x) { return expf(x); }
+__device__ inline float Log(float x) { return logf(x); }
+__device__ inline float Sqrt(float x) { return sqrtf(x); }
 
 )";
 
 static constexpr char predefined_cuda_functions_fp64[] = R"(
-__device__ inline double real_exp(double x) { return ::exp(x); }
-__device__ inline double real_log(double x) { return ::log(x); }
+__device__ inline double Max(double x, double y) { return fmax(x, y); }
+__device__ inline double Exp(double x) { return exp(x); }
+__device__ inline double Log(double x) { return log(x); }
+__device__ inline double Sqrt(double x) { return sqrt(x); }
 
 )";
 
 static constexpr char predefined_cuda_functions_fp16[] = R"(
-__device__ inline float real_exp(float x) { return ::expf(x); }
-__device__ inline float real_log(float x) { return ::logf(x); }
+__device__ inline float Max(float x, float y) { return fmaxf(x, y); }
+__device__ inline float Exp(float x) { return expf(x); }
+__device__ inline float Log(float x) { return logf(x); }
+__device__ inline float Sqrt(float x) { return sqrtf(x); }
 
 #define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
 #define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
index 93986945273..c81e9c27b6b 100644
--- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
+++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
@@ -60,52 +60,41 @@ static bool IsEqualAndNotEmpty(const std::vector<int64_t>& l,
   return l.size() != 0U && r.size() != 0U && l == r;
 }
 
-bool GroupDetector::IsFusionGroupOp(const Node* n) {
-  if (!(n && n->IsOp() && n->Op())) return false;
-  bool is_first = true;
-  proto::VarType::Type i_data_type = proto::VarType::FP32;
-  proto::VarType::Type o_data_type = proto::VarType::FP32;
-
-  for (auto* i_node : n->inputs) {
-    if (!i_node->Var()) return false;
-    if (i_node->Var()->GetType() != proto::VarType::LOD_TENSOR) {
-      return false;
-    }
-    if (is_first) {
-      i_data_type = i_node->Var()->GetDataType();
-      is_first = false;
-    } else {
-      if (i_data_type != i_node->Var()->GetDataType()) return false;
-    }
-  }
+bool GroupDetector::CheckPrecondition(const Node* n) {
+  auto check_data_type = [&](const std::vector<Node*>& nodes) -> bool {
+    bool is_first = true;
+    proto::VarType::Type data_type_0;
+    for (auto* n : nodes) {
+      if (n && n->IsVar() && n->Var()) {
+        if (n->Var()->GetType() != proto::VarType::LOD_TENSOR) {
+          return false;
+        }
 
-  is_first = true;
-  for (auto* o_node : n->outputs) {
-    if (!o_node->Var()) return false;
-    if (o_node->Var()->GetType() != proto::VarType::LOD_TENSOR) {
-      return false;
-    }
-    if (is_first) {
-      o_data_type = o_node->Var()->GetDataType();
-      is_first = false;
-    } else {
-      if (o_data_type != o_node->Var()->GetDataType()) return false;
+        proto::VarType::Type data_type_i = n->Var()->GetDataType();
+        if (data_type_i == proto::VarType::FP32 ||
+            data_type_i == proto::VarType::FP64 ||
+            data_type_i == proto::VarType::FP16) {
+          if (is_first) {
+            data_type_0 = data_type_i;
+            is_first = false;
+          } else if (data_type_0 != data_type_i) {
+            return false;
+          }
+        } else {
+          return false;
+        }
+      }
     }
-  }
-
-  if (!(i_data_type == proto::VarType::FP32 ||
-        i_data_type == proto::VarType::FP64 ||
-        i_data_type == proto::VarType::FP16) ||
-      !(o_data_type == proto::VarType::FP32 ||
-        o_data_type == proto::VarType::FP64 ||
-        o_data_type == proto::VarType::FP16))
-    return false;
+    return true;
+  };
 
-  return true;
+  return n && n->IsOp() && n->Op() && check_data_type(n->inputs) &&
+         check_data_type(n->outputs);
 }
 
 bool ElementwiseGroupDetector::IsElementwiseOp(const Node* n) {
   if (IsSpecifiedOp(GetElementwiseOpTypes(), n)) {
+    // Check whether all inputs have the same shape.
     std::vector<int64_t> shape_0;
     for (size_t i = 0; i < n->inputs.size(); ++i) {
       auto* in_i = n->inputs[i];
@@ -130,7 +119,7 @@ bool ElementwiseGroupDetector::IsElementwiseOp(const Node* n) {
 std::vector<std::vector<Node*>> ElementwiseGroupDetector::operator()(
     Graph* graph) {
   auto teller = [&](const Node* n) -> bool {
-    return IsFusionGroupOp(n) && IsElementwiseOp(n);
+    return CheckPrecondition(n) && IsElementwiseOp(n);
   };
 
   return SubgraphDetector(graph, teller)();
diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h
index 58601c6ad77..0861c2f7e96 100644
--- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h
+++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h
@@ -25,7 +25,7 @@ namespace fusion_group {
 
 class GroupDetector {
  protected:
-  bool IsFusionGroupOp(const Node* n);
+  bool CheckPrecondition(const Node* n);
 };
 
 class ElementwiseGroupDetector : GroupDetector {
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
index b672a80662e..a34b27b6418 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
@@ -33,6 +33,8 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
     fusion_group::OperationMap::Init();
     int num_elementwise_groups = DetectFusionGroup(graph, 0);
     AddStatis(num_elementwise_groups);
+    LOG(INFO) << "Detect " << num_elementwise_groups
+              << " elementwise fusion groups.";
   }
 }
 
@@ -54,7 +56,7 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
     VLOG(3) << "subgraph: {\n" << DebugString(subgraph.SortedNodes()) << "}\n";
     if (subgraph.IsValid(min_subgraph_size)) {
-      subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
+      subgraph.SetFuncName("FusedElementwise" + std::to_string(index++));
       if (GenerateCode(&subgraph)) {
         InsertFusionGroupOp(graph, &subgraph);
         num_subgraphs++;
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc
index f6846676c72..0ff9754bb29 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.cc
+++ b/paddle/fluid/framework/ir/fusion_group/operation.cc
@@ -95,20 +95,29 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   // sigmoid:
   //  out = f(x) = 1.0 / (1.0 + exp(-x))
   //  dx = dout * out * (1 - out)
-  insert_handler("sigmoid", "1.0 / (1.0 + real_exp(- ${0}))",
+  insert_handler("sigmoid", "1.0 / (1.0 + Exp(- ${0}))",
                  {"${2} * ${1} * (1.0 - ${1})"});
   // tanh:
   //  out = f(x) = 2.0 / (1.0 + exp(-2.0 * x)) - 1.0;
   //  dx = dout * (1 - out * out)
-  insert_handler("tanh", "2.0 / (1.0 + real_exp(-2.0 * ${0})) - 1.0",
+  insert_handler("tanh", "2.0 / (1.0 + Exp(-2.0 * ${0})) - 1.0",
                  {"${2} * (1.0 - ${1} * ${1})"});
 
-  // cast
-  //  out = static_cast<T>(d)
-  //  dx = static_cast<T>(d_out)
+  // cast:
+  //  out = static_cast<T>(x)
   // TODO(wangchaochaohu): This is not the complete definition of
   // cast Op. We need to refine it later.
-  insert_handler("cast", "${0}", {"${0}"});
+  insert_handler("cast", "${0}", {});
+
+  // sqrt:
+  //  out = x^(1/2)
+  //  dx = dout * 0.5 / out
+  insert_handler("sqrt", "Sqrt(${0})", {"${2} * 0.5 / ${1}"});
+
+  // square:
+  //  out = x^2
+  //  dx = dout * 2.0 * x
+  insert_handler("square", "${0} * ${0}", {"${2} * 2.0 * ${0}"});
 }
 
 void OperationMap::InsertBinaryElementwiseOperations() {
@@ -168,9 +177,13 @@ void OperationMap::InsertMultivariateElementwiseOperations() {
     Insert(type, num_oprands, op_type, expr, grad_exprs, {"X"}, {"Out"});
   };
 
-  // here [] represent the number of input is positive(>=0).
-  // if input list size of Sum Op is 3, It will expand as
-  // ${0} + ${1} + ${2}
+  // sum:
+  //  out = x_0 + x_1 + ... + x_N-1
+  //
+  // For sum with N inputs, the expression inside "[]" will be expanded
+  // N - 1 times, and ${?} is replaced by the input index, starting from 1.
+  // For example, for sum with 4 inputs, the expanded expression is:
+  //  ${0} + ${1} + ${2} + ${3}
   insert_handler("sum", "${0}[ + ${?}]", {});
 }
 
diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py
index 4ee3f260228..2ed574bf756 100644
--- a/python/paddle/fluid/tests/unittests/ir/pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py
@@ -38,7 +38,6 @@ class PassTest(unittest.TestCase):
         self.pass_attrs = {}
         self.fused_op_type = None
         self.num_fused_ops = -1
-        self.backward = True
 
         np.random.seed(123)
         random.seed(124)
@@ -49,7 +48,11 @@ class PassTest(unittest.TestCase):
             places.append(fluid.CUDAPlace(0))
         return places
 
-    def append_gradinets(self, outs):
+    def grad(self, var):
+        grad_name = var.name + "@GRAD"
+        return self.main_program.global_block().var(grad_name)
+
+    def append_gradients(self, outs):
         with fluid.program_guard(self.main_program, self.startup_program):
             loss = fluid.layers.mean(outs)
             fluid.backward.append_backward(loss)
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
index 6724b634b7d..ef57752e87e 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
@@ -35,15 +35,12 @@ class FusionGroupPassTest(PassTest):
             # subgraph with 2 op nodes
             tmp_2 = layers.relu(tmp_0 + tmp_1)
 
-        self.num_fused_ops = 1
-        self.fetch_list = [tmp_2.name, tmp_1.name + "@GRAD"]
+        self.append_gradients(tmp_2)
 
-        if self.backward:
-            self.append_gradinets(tmp_2)
-            self.num_fused_ops = 2
+        self.num_fused_ops = 2
+        self.fetch_list = [tmp_2, self.grad(tmp_1)]
 
     def setUp(self):
-        self.backward = True
         self.build_program("float32")
         self.feeds = self._feed_random_data(self.feed_vars)
         self.pass_names = "fusion_group_pass"
@@ -91,13 +88,10 @@ class FusionGroupPassTest1(FusionGroupPassTest):
                 self.feed_vars[2]) * layers.tanh(self.feed_vars[3])
             tmp_2 = layers.tanh(tmp_1) + layers.sigmoid(self.feed_vars[4])
 
-        if self.backward:
-            self.append_gradinets(tmp_2)
-            self.num_fused_ops = 2
-        else:
-            self.num_fused_ops = 1
+        self.append_gradients(tmp_2)
 
-        self.fetch_list = [tmp_2.name, tmp_0.name + "@GRAD"]
+        self.num_fused_ops = 2
+        self.fetch_list = [tmp_2, self.grad(tmp_0)]
 
 
 class FusionGroupPassTest2(FusionGroupPassTest):
@@ -115,20 +109,11 @@ class FusionGroupPassTest2(FusionGroupPassTest):
            tmp_2 = layers.relu(layers.sigmoid(self.feed_vars[3]))
            tmp_3 = layers.mul(tmp_1, tmp_2)
 
-        self.num_fused_ops = 2
-        self.fetch_list = [tmp_3.name]
-
-        #TODO(wangchaochaohu): we need to deal with the condition of stop gradient
-        if self.backward:
-            self.append_gradinets(tmp_3)
-            self.num_fused_ops = 3
+        # TODO(wangchaochaohu): support the case when some vars are set
+        # stop_gradient = True.
 
-    def setUp(self):
-        self.backward = False
-        self.build_program("float32")
-        self.feeds = self._feed_random_data(self.feed_vars)
-        self.pass_names = "fusion_group_pass"
-        self.fused_op_type = "fusion_group"
+        self.num_fused_ops = 2
+        self.fetch_list = [tmp_3]
 
 
 class FusionGroupPassTestFP64(FusionGroupPassTest):
@@ -147,32 +132,41 @@ class FusionGroupPassTestFP16(FusionGroupPassTest):
                 fluid.data(
                     name="data2", shape=[128, 128], dtype=dtype))
 
+            # subgraph with 2 op nodes
             tmp_0 = self.feed_vars[0] * self.feed_vars[1]
-            tmp_1 = layers.mul(tmp_0, self.feed_vars[2])
-            tmp_3 = layers.cast(tmp_1, dtype="float16")
-            tmp_2 = layers.cast(tmp_0, dtype="float16")
-            tmp_4 = layers.relu(tmp_2 + tmp_3)
+            tmp_1 = layers.cast(tmp_0, dtype="float16")
+            tmp_2 = layers.mul(tmp_0, self.feed_vars[2])
+            # subgraph with 4 op nodes
+            tmp_3 = layers.cast(tmp_2, dtype="float16")
+            tmp_4 = layers.relu(tmp_1 + tmp_3)
             tmp_5 = layers.cast(tmp_4, dtype=dtype)
 
-        self.num_fused_ops = 1
-        self.fetch_list = [tmp_5.name]
+        self.append_gradients(tmp_5)
 
-        if self.backward:
-            self.num_fused_ops = 4
-            self.append_gradinets(tmp_5)
+        self.num_fused_ops = 3
+        self.fetch_list = [tmp_5, self.grad(tmp_0)]
 
 
 class FusionGroupPassSumTest(FusionGroupPassTest):
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
-            self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 5)
+            self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 3)
+            self.feed_vars.append(
+                fluid.data(
+                    name="data3", shape=[128, 128], dtype=dtype))
 
-            tmp_0 = layers.elementwise_add(self.feed_vars[0], self.feed_vars[1])
-            tmp_1 = layers.sum([tmp_0, self.feed_vars[2], self.feed_vars[3]])
-            tmp_2 = layers.sum([tmp_1, self.feed_vars[4]])
+            # subgraph with 2 op nodes
+            tmp_0 = layers.sum(
+                [self.feed_vars[0], self.feed_vars[1], self.feed_vars[2]])
+            tmp_1 = layers.sqrt(tmp_0)
+            tmp_2 = layers.mul(tmp_0, self.feed_vars[3])
+            # subgraph with 2 op nodes
+            tmp_3 = layers.square(layers.sum([tmp_1, tmp_2]))
 
-        self.fetch_list = [tmp_0, tmp_1, tmp_2]
-        self.num_fused_ops = 1
+        self.append_gradients(tmp_3)
+
+        self.num_fused_ops = 3
+        self.fetch_list = [tmp_3, self.grad(tmp_0)]
 
 
 class FusionGroupPassCastTest(FusionGroupPassTest):
@@ -184,12 +178,10 @@ class FusionGroupPassCastTest(FusionGroupPassTest):
             tmp_1 = layers.cast(tmp_0, dtype="double")
             tmp_2 = layers.cast(tmp_1, dtype="float32")
 
-        self.fetch_list = [tmp_2.name, tmp_1.name + "@GRAD"]
-        self.num_fused_ops = 1
+        self.append_gradients(tmp_2)
 
-        if self.backward:
-            self.num_fused_ops = 2
-            self.append_gradinets(tmp_2)
+        self.num_fused_ops = 2
+        self.fetch_list = [tmp_2, self.grad(tmp_0)]
 
     def setUp(self):
         self.build_program("float64")
--
GitLab
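
Editor's note: for readers who want a concrete picture of what this patch enables, the sketch below approximates the kind of CUDA kernel the fusion_group code generator is intended to emit once sqrt and square are registered. It is an illustration only, not output of the pass: the kernel name, parameter names, and launch-loop structure are assumptions; only the Sqrt device helper and the expression forms ("Sqrt(${0})" and "${0} * ${0}") come from the patch above.

// Hand-written approximation of a generated fusion_group kernel for
// out = square(sqrt(x) + y). Only the Sqrt helper and the sqrt/square
// expression templates mirror the patch; names and launch logic are invented.
__device__ inline float Sqrt(float x) { return sqrtf(x); }

extern "C" __global__ void FusedElementwise0(int n, const float* x,
                                             const float* y, float* out) {
  // Grid-stride loop over all elements of the fused subgraph's tensors.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    float tmp_0 = Sqrt(x[i]);    // sqrt:   "Sqrt(${0})"
    float tmp_1 = tmp_0 + y[i];  // intermediate elementwise add
    out[i] = tmp_1 * tmp_1;      // square: "${0} * ${0}"
  }
}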