Unverified commit 9091f8cd, authored by Yiqun Liu and committed by GitHub

Support generating code for grad_op (#21066)

* Add the definition of operation in fusion_group.

* Use operations in OperationMap to detect fusion_group of elementwise pattern.

* Add namespace fusion_group in code_generator.

* Use operations recorded in OperationMap to generate code.

* Remove implementation codes to .cc file.

* Refine Operation and CodeGenerator to make it easier to generate code for grad_op.
Refine the unittest for better reuse.

* Avoid recording the template's keywords in an array.

* Support generating code for grad_op and add a unittest (see the usage sketch after this message).
test=develop

* Remove replaced_element_in_order and use numbers instead.
test=develop
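
For illustration, here is a minimal sketch of how the refined interfaces in this change fit together, mirroring the grad unittest in this diff (the include paths and kernel name follow the files below; the variable ids are the ones used in the test):

// Sketch only: build the grad expressions t2' = relu_grad(t2, t3, t3') and
// t0', t1' = elementwise_mul_grad(t0, t1, t2, t2'), then generate CUDA source.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fusion_group/code_generator.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"

namespace fusion_group = paddle::framework::ir::fusion_group;

std::string GenerateGradKernelSource() {
  // Register the forward and grad expressions before generating code.
  fusion_group::OperationMap::Init();
  fusion_group::OperationExpression relu_grad("relu_grad", {2, 3, 7}, {6});
  fusion_group::OperationExpression mul_grad("elementwise_mul_grad",
                                             {0, 1, 2, 6}, {4, 5});
  fusion_group::CodeGenerator code_generator;
  // Returns the CUDA kernel string, which can be compiled with CUDADeviceCode.
  return code_generator.GenerateCode("fused_elementwise_grad_0",
                                     {relu_grad, mul_grad});
}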
Parent 1cd67218
cc_library(code_generator SRCS code_generator.cc code_generator_helper.cc DEPS graph)
cc_library(code_generator SRCS operation.cc code_generator.cc code_generator_helper.cc DEPS graph)
if(NOT APPLE AND NOT WIN32)
if(WITH_GPU)
cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor)
......@@ -7,5 +7,5 @@ endif()
cc_library(fusion_group_pass
SRCS fusion_group_pass.cc elementwise_group_detector.cc
DEPS graph_pattern_detector pass)
DEPS graph_pattern_detector pass code_generator)
cc_test(test_fusion_group_pass SRCS fusion_group_pass_tester.cc DEPS fusion_group_pass graph_viz_pass)
......@@ -20,17 +20,82 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace ir {
namespace fusion_group {
CodeGenerator::CodeGenerator(CodeTemplate code_template) {
code_template_ = code_template;
CodeGenerator::CodeGenerator() {
// Only support elementwise operations now.
code_templates_.resize(1);
CodeTemplate elementwise_t(elementwise_cuda_template);
code_templates_[0] = elementwise_t;
}
// In order to get the right result of the expression, we need to calculate
// and store the expressions in suffix (postfix) order using a vector.
std::string CodeGenerator::GenerateCode(TemplateVariable template_var) {
auto cuda_kernel = kernel_function + code_template_.Format(template_var);
return cuda_kernel;
std::string CodeGenerator::GenerateCode(
std::string func_name, std::vector<OperationExpression> expressions) {
// Check whether all expressions are elementwise operations.
TemplateVariable template_var;
template_var.Add("func_name", func_name);
template_var.Add("parameters", EmitParameters(expressions, "float"));
template_var.Add("compute_body", EmitComputeBody(expressions));
return predefined_cuda_functions + code_templates_[0].Format(template_var);
}
// Generate the parameter list code from the expression information.
std::string CodeGenerator::EmitParameters(
std::vector<OperationExpression> expressions, std::string dtype) {
std::set<int> input_ids;
std::set<int> output_ids;
// Remove the repeated ids and get an ordered list.
for (size_t i = 0; i < expressions.size(); i++) {
for (auto id : expressions[i].GetInputIds()) {
input_ids.insert(id);
}
for (auto id : expressions[i].GetOutputIds()) {
output_ids.insert(id);
}
}
// If an id is in both the input and output lists, remove it
// from the input list.
for (auto iter = input_ids.begin(); iter != input_ids.end();) {
if (output_ids.find(*iter) != output_ids.end()) {
input_ids.erase(iter++);
} else {
iter++;
}
}
std::stringstream ret;
ret << "int N, ";
for (auto iter = input_ids.begin(); iter != input_ids.end(); iter++) {
ret << dtype << "* " << VarName(*iter) << ", ";
}
size_t count_index = 0;
for (auto iter = output_ids.begin(); iter != output_ids.end(); iter++) {
ret << dtype << "* " << VarName(*iter);
if (count_index != output_ids.size() - 1) {
ret << ", ";
}
count_index++;
}
return ret.str();
}
std::string CodeGenerator::EmitComputeBody(
std::vector<OperationExpression> expressions) {
// Generate the expression code using the suffix (postfix) expressions.
std::stringstream ret;
for (size_t i = 0; i < expressions.size(); i++) {
ret << expressions[i].GetExpression();
}
return ret.str();
}
} // namespace fusion_group
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
......@@ -20,19 +21,30 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace ir {
namespace fusion_group {
class CodeGenerator {
public:
explicit CodeGenerator(CodeTemplate code_template);
CodeGenerator();
std::string GenerateCode(TemplateVariable template_var);
std::string GenerateCode(std::string func_name,
std::vector<OperationExpression> expressions);
// TODO(wangchao): add a more general interface
// std::string Generate(const std::string name, const SubGraph& subgraph);
private:
CodeTemplate code_template_;
// Generate the parameter list code from the expression information.
std::string EmitParameters(std::vector<OperationExpression> expressions,
std::string dtype);
std::string EmitComputeBody(std::vector<OperationExpression> expressions);
private:
std::vector<CodeTemplate> code_templates_;
};
} // namespace fusion_group
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -17,53 +17,66 @@ limitations under the License. */
#include <sstream>
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
namespace paddle {
namespace framework {
namespace ir {
namespace fusion_group {
OperationExpression::OperationExpression(std::vector<int> input_ids,
int output_id, std::string op) {
input_ids_ = input_ids;
output_id_ = output_id;
op_ = op;
template <typename T>
static T StringTo(const std::string& str) {
std::istringstream is(str);
T value;
is >> value;
return value;
}
std::string OperationExpression::GetRHSTemplate() {
std::stringstream ret;
std::string rhs_end = ";";
auto rhs = support_table[op_];
for (size_t i = 0; i < input_ids_.size(); i++) {
auto replaced_str = replaced_element_in_order[i];
auto pos = rhs.find(replaced_str);
auto index = input_ids_[i];
rhs.replace(pos, replaced_str.length(), std::to_string(index) + R"([idx])");
std::string OperationExpression::GetRHS(size_t i) {
auto rhs = OperationMap::Instance().Get(op_).exprs[i];
for (size_t i = 0; i < rhs.size(); i++) {
size_t pos = i;
if (rhs[pos] == '$' && rhs[pos + 1] == '{') {
int length = 0;
while (rhs[pos + 2 + length] != '}') {
length++;
}
ret << rhs << rhs_end;
return ret.str();
std::string index_str = rhs.substr(pos + 2, length);
int index = StringTo<int>(index_str);
PADDLE_ENFORCE_LT(index, input_ids_.size(),
"Only %d inputs are provided, but need %d.",
input_ids_.size(), index + 1);
rhs.replace(pos, length + 3, VarName(input_ids_[index]) + R"([idx])");
}
}
return rhs;
}
std::string OperationExpression::GetLHSTemplate() {
std::string OperationExpression::GetLHS(size_t i) {
std::stringstream ret;
ret << "var" << output_id_ << R"([idx] = )";
ret << VarName(output_ids_[i]) << R"([idx])";
return ret.str();
}
bool OperationExpression::SupportState() {
return (support_table.find(op_) == support_table.end());
bool OperationExpression::IsSupport() {
return OperationMap::Instance().Has(op_);
}
// We traverse the graph and get the group; all input and output ids are
// unique for the nodes that belong to the group.
std::string OperationExpression::GetExpression() {
std::stringstream ret;
if (!SupportState()) {
ret << GetLHSTemplate() << GetRHSTemplate();
if (IsSupport()) {
for (size_t i = 0; i < output_ids_.size(); ++i) {
ret << GetLHS(i) << " = " << GetRHS(i) << ";";
}
}
return ret.str();
}
} // namespace fusion_group
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -14,52 +14,43 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
namespace fusion_group {
static std::string VarName(int index) { return "var" + std::to_string(index); }
static std::vector<std::string> replaced_element_in_order = {"@", "$"};
static std::vector<std::string> kernel_template = {"$name", "$parameter",
"$compute"};
static std::unordered_map<std::string, std::string> support_table = {
{"elementwise_add", "var@ + var$"},
{"elementwise_sub", "var@ - var$"},
{"elementwise_mul", "var@ * var$"},
{"elementwise_div", "var@ / var$"},
{"elementwise_min", "real_min(var@, var$)"},
{"elementwise_max", "real_max(var@, var$)"},
{"relu", "real_max(var@, 0)"},
{"sigmoid", "1.0 / (1.0 + real_exp(-var@))"}};
// A Paddle elementwise op consists of a broadcast op and an elementwise op.
// The op computation may be composed of one or many operations.
// Here we only generate code for the simple expression case, so we keep it
// simple.
class OperationExpression {
public:
OperationExpression(std::vector<int> input_ids, int output_id,
std::string op);
std::string GetExpression();
explicit OperationExpression(std::string op, std::vector<int> input_ids,
std::vector<int> output_ids)
: op_(op), input_ids_(input_ids), output_ids_(output_ids) {}
std::vector<int> GetInputIds() { return input_ids_; }
int GetOutputId() { return output_id_; }
bool SupportState();
// In order to make the offset more flexible, we add stride and base offset.
std::string GetRHSTemplate();
std::string GetLHSTemplate();
std::vector<int> GetOutputIds() { return output_ids_; }
// Check whether this operation type is supported in OperationMap.
bool IsSupport();
std::string GetExpression();
// TODO(wangchao): make the offset more flexible by adding stride and base offset.
std::string GetRHS(size_t i = 0);
std::string GetLHS(size_t i = 0);
private:
std::vector<int> input_ids_;
int output_id_;
std::string op_;
std::vector<int> input_ids_;
std::vector<int> output_ids_;
};
class TemplateVariable {
......@@ -92,24 +83,30 @@ class CodeTemplate {
std::string Format(TemplateVariable template_var) {
std::string ret = template_str_;
std::unordered_map<std::string, std::string> identifier_str =
template_var.Get();
std::unordered_map<std::string, bool> found;
// Words beginning with "$" in template_str will be replaced.
for (size_t i = 0; i < ret.size(); i++) {
auto pos = i;
char c = ret[pos];
if (c == '$') {
for (size_t j = 0; j < kernel_template.size(); j++) {
int template_size = kernel_template[j].size();
auto tmp_cmp = ret.substr(pos, template_size);
if (tmp_cmp == kernel_template[j]) {
ret.replace(pos, template_size, identifier_str[kernel_template[j]]);
for (auto iter : template_var.Get()) {
std::string keyword = iter.first;
if (ret.substr(pos + 1, keyword.size()) == keyword) {
found[keyword] = true;
ret.replace(pos, keyword.size() + 1, iter.second);
break;
}
}
}
}
for (auto iter : template_var.Get()) {
PADDLE_ENFORCE_NE(found.find(iter.first), found.end(),
"Keyword %s in template is not set.", iter.first);
}
return EmitIndents(ret);
}
......@@ -142,103 +139,33 @@ class CodeTemplate {
std::string template_str_;
};
static std::string EmitUniqueName(std::vector<OperationExpression> expression) {
std::stringstream ret;
ret << "fused_kernel";
for (size_t i = 0; i < expression.size(); i++) {
ret << expression[i].GetOutputId();
}
return ret.str();
}
// Generate the parameter list code from the expression information.
static std::string EmitDeclarationCode(
std::vector<OperationExpression> expression, std::string type) {
std::stringstream ret;
std::set<int> input_ids;
std::set<int> output_ids;
for (size_t i = 0; i < expression.size(); i++) {
std::vector<int> tmp_input = expression[i].GetInputIds();
for (size_t j = 0; j < tmp_input.size(); j++) {
int id = tmp_input[j];
input_ids.insert(id);
}
int tmp_output = expression[i].GetOutputId();
output_ids.insert(tmp_output);
}
std::set<int>::iterator it = input_ids.begin();
while (it != input_ids.end()) {
int var_index = *it;
if (output_ids.find(var_index) != output_ids.end()) {
input_ids.erase(it++);
} else {
it++;
}
}
ret << "int N, ";
for (it = input_ids.begin(); it != input_ids.end(); it++) {
int var_index = *it;
ret << type << R"(* var)" << var_index;
ret << ", ";
}
size_t count_index = 0;
for (it = output_ids.begin(); it != output_ids.end(); it++) {
int var_index = *it;
ret << type << R"(* var)" << var_index;
if (count_index != output_ids.size() - 1) {
ret << ", ";
}
count_index++;
}
return ret.str();
}
static std::string EmitComputeCode(
std::vector<OperationExpression> expression) {
// Generate the expression code using the suffix (postfix) expressions.
std::stringstream ret;
for (size_t i = 0; i < expression.size(); i++) {
ret << expression[i].GetExpression();
}
return ret.str();
}
static const char kernel_function[] = R"(
static const char predefined_cuda_functions[] = R"(
__device__ float real_exp(float x) { return ::expf(x); }
__device__ double real_exp(double x) { return ::exp(x); }
__device__ float real_log(float x) { return ::logf(x); }
__device__ double real_log(double x) { return ::log(x); }
__device__ float real_min(float x, float y) { return ::fminf(x, y); }
__device__ double real_min(double x, double y) { return ::fmin(x, y); }
__device__ float real_max(float x, float y) { return ::fmaxf(x, y); }
__device__ double real_max(double x, double y) { return ::fmax(x, y); }
)";
static const char kernel_elementwise_template[] = R"(
static const char elementwise_cuda_template[] = R"(
extern "C" __global__ void $name($parameter){
extern "C" __global__ void $func_name($parameters) {
for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < N;
idx += gridDim.x * blockDim.x) {
$compute
}
$compute_body
}
}
)";
} // namespace fusion_group
} // namespace ir
} // namespace framework
} // namespace paddle
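
For reference, given the elementwise chain used in the first unittest below (ids 0..8: mul, add, sub, relu, sigmoid), formatting elementwise_cuda_template with the emitted parameters and compute body yields a kernel roughly like the following (whitespace simplified; the real_* helpers from predefined_cuda_functions are prepended in the actual output):

extern "C" __global__ void fused_elementwise_0(int N, float* var0, float* var1, float* var3, float* var5, float* var2, float* var4, float* var6, float* var7, float* var8) {
  for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
      idx < N;
      idx += gridDim.x * blockDim.x) {
    var2[idx] = var0[idx] * var1[idx];
    var4[idx] = var2[idx] + var3[idx];
    var6[idx] = var4[idx] - var5[idx];
    var7[idx] = real_max(var6[idx], 0);
    var8[idx] = 1.0 / (1.0 + real_exp(- var7[idx]));
  }
}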
......@@ -17,125 +17,176 @@ limitations under the License. */
#include <cmath>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/init.h"
#ifdef PADDLE_WITH_CUDA
TEST(code_generator, cuda) {
std::vector<int> mul_input{1, 2};
std::vector<int> add_input{3, 4};
std::vector<int> sub_input{5, 6};
std::vector<int> relu_input{7};
std::vector<int> sigmoid_input{8};
int mul_out = 3;
int add_out = 5;
int sub_out = 7;
int relu_out = 8;
int sigmoid_out = 9;
std::string op1 = "elementwise_mul";
std::string op2 = "elementwise_add";
std::string op3 = "elementwise_sub";
std::string op4 = "relu";
std::string op5 = "sigmoid";
paddle::framework::ir::OperationExpression opexp1(mul_input, mul_out, op1);
paddle::framework::ir::OperationExpression opexp2(add_input, add_out, op2);
paddle::framework::ir::OperationExpression opexp3(sub_input, sub_out, op3);
paddle::framework::ir::OperationExpression opexp4(relu_input, relu_out, op4);
paddle::framework::ir::OperationExpression opexp5(sigmoid_input, sigmoid_out,
op5);
std::vector<paddle::framework::ir::OperationExpression> fused_op = {
opexp1, opexp2, opexp3, opexp4, opexp5};
paddle::framework::ir::CodeTemplate code_template(
paddle::framework::ir::kernel_elementwise_template);
paddle::framework::ir::CodeGenerator codegen(code_template);
paddle::framework::ir::TemplateVariable template_var;
template_var.Add("$name", EmitUniqueName(fused_op));
template_var.Add("$parameter", EmitDeclarationCode(fused_op, "float"));
template_var.Add("$compute", EmitComputeCode(fused_op));
std::string saxpy_code = codegen.GenerateCode(template_var);
std::cout << saxpy_code << std::endl;
namespace fusion_group = paddle::framework::ir::fusion_group;
template <typename T>
void SetupRandomCPUTensor(paddle::framework::LoDTensor* tensor) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
T* ptr = tensor->data<T>();
PADDLE_ENFORCE_NOT_NULL(
ptr, "Call mutable_data to alloc memory for Tensor first.");
for (int64_t i = 0; i < tensor->numel(); ++i) {
ptr[i] = static_cast<T>(uniform_dist(rng)) - static_cast<T>(0.5);
}
}
void TestMain(std::string func_name,
std::vector<fusion_group::OperationExpression> expressions,
std::vector<paddle::framework::LoDTensor> cpu_tensors, int n,
std::vector<int> input_ids, std::vector<int> output_ids) {
fusion_group::OperationMap::Init();
fusion_group::CodeGenerator code_generator;
std::string code_str = code_generator.GenerateCode(func_name, expressions);
VLOG(3) << code_str;
paddle::framework::InitDevices(false, {0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode code(place, EmitUniqueName(fused_op),
saxpy_code);
paddle::framework::Tensor cpu_a;
paddle::framework::Tensor cpu_b;
paddle::framework::Tensor cpu_c;
paddle::framework::Tensor cpu_d;
paddle::framework::Tensor cpu_e;
paddle::framework::Tensor cpu_f;
paddle::framework::Tensor cpu_g;
paddle::framework::Tensor cpu_h;
paddle::framework::Tensor cpu_o;
paddle::platform::CUDADeviceCode device_code(place, func_name, code_str);
device_code.Compile();
std::vector<paddle::framework::LoDTensor> gpu_tensors(cpu_tensors.size());
std::vector<float*> gpu_ptrs(gpu_tensors.size());
std::vector<void*> args;
args.push_back(&n);
for (size_t i = 0; i < input_ids.size(); ++i) {
gpu_ptrs[input_ids[i]] = gpu_tensors[input_ids[i]].mutable_data<float>(
cpu_tensors[input_ids[i]].dims(), place);
args.push_back(&gpu_ptrs[input_ids[i]]);
SetupRandomCPUTensor<float>(&cpu_tensors[input_ids[i]]);
TensorCopySync(cpu_tensors[input_ids[i]], place,
&gpu_tensors[input_ids[i]]);
}
for (size_t i = 0; i < output_ids.size(); ++i) {
gpu_ptrs[output_ids[i]] = gpu_tensors[output_ids[i]].mutable_data<float>(
cpu_tensors[output_ids[i]].dims(), place);
args.push_back(&gpu_ptrs[output_ids[i]]);
}
device_code.SetNumThreads(1024);
device_code.SetWorkloadPerThread(1);
device_code.Launch(n, &args);
auto* dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
dev_ctx->Wait();
for (size_t i = 0; i < output_ids.size(); ++i) {
TensorCopySync(gpu_tensors[output_ids[i]], paddle::platform::CPUPlace(),
&cpu_tensors[output_ids[i]]);
}
}
TEST(code_generator, elementwise) {
// t2 = t0 * t1
// t4 = t2 + t3
// t6 = t4 - t5
// t7 = relu(t6)
// t8 = sigmoid(t7)
fusion_group::OperationExpression exp1("elementwise_mul", {0, 1}, {2});
fusion_group::OperationExpression exp2("elementwise_add", {2, 3}, {4});
fusion_group::OperationExpression exp3("elementwise_sub", {4, 5}, {6});
fusion_group::OperationExpression exp4("relu", {6}, {7});
fusion_group::OperationExpression exp5("sigmoid", {7}, {8});
std::vector<fusion_group::OperationExpression> expressions = {
exp1, exp2, exp3, exp4, exp5};
// Prepare CPU tensors
std::vector<paddle::framework::LoDTensor> cpu_tensors(9);
std::vector<int> input_ids = {0, 1, 3, 5};
std::vector<int> output_ids = {2, 4, 6, 7, 8};
auto dims = paddle::framework::make_ddim(
{static_cast<int64_t>(256), static_cast<int64_t>(1024)});
cpu_a.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_b.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_c.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_d.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_e.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_f.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_g.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_o.mutable_data<float>(dims, paddle::platform::CPUPlace());
size_t n = cpu_a.numel();
for (size_t i = 0; i < n; ++i) {
cpu_a.data<float>()[i] = static_cast<float>(i);
for (size_t i = 0; i < cpu_tensors.size(); ++i) {
cpu_tensors[i].mutable_data<float>(dims, paddle::platform::CPUPlace());
}
for (size_t i = 0; i < n; ++i) {
cpu_b.data<float>()[i] = static_cast<float>(0.5);
cpu_d.data<float>()[i] = static_cast<float>(10.0);
cpu_f.data<float>()[i] = static_cast<float>(0.0);
int n = cpu_tensors[0].numel();
TestMain("fused_elementwise_0", expressions, cpu_tensors, n, input_ids,
output_ids);
auto cpu_kernel_handler = [&](float* var0, float* var1, float* var3,
float* var5, int i) -> float {
float var2_i = var0[i] * var1[i];
float var4_i = var2_i + var3[i];
float var6_i = var4_i - var5[i];
float var7_i = var6_i > 0.0 ? var6_i : 0.0;
float var8_i = 1.0 / (1.0 + std::exp(-var7_i));
return var8_i;
};
// Check the results
for (int i = 0; i < n; i++) {
float result = cpu_kernel_handler(
cpu_tensors[0].data<float>(), cpu_tensors[1].data<float>(),
cpu_tensors[3].data<float>(), cpu_tensors[5].data<float>(), i);
PADDLE_ENFORCE_LT(fabs(cpu_tensors[8].data<float>()[i] - result), 1.E-05);
}
}
TEST(code_generator, elementwise_grad) {
// The var order: t0, t1, t2, t3, t0', t1', t2', t3'
// t2 = t0 * t1
// t3 = relu(t2)
// t2' = relu_grad(t2, t3, t3')
// t0', t1' = elementwise_mul_grad(t0, t1, t2, t2')
fusion_group::OperationExpression exp1("relu_grad", {2, 3, 7}, {6});
fusion_group::OperationExpression exp2("elementwise_mul_grad", {0, 1, 2, 6},
{4, 5});
std::vector<fusion_group::OperationExpression> expressions = {exp1, exp2};
// Prepare CPU tensors
std::vector<paddle::framework::LoDTensor> cpu_tensors(8);
std::vector<int> input_ids = {0, 1, 2, 3, 7};
std::vector<int> output_ids = {4, 5, 6};
auto dims = paddle::framework::make_ddim(
{static_cast<int64_t>(256), static_cast<int64_t>(1024)});
for (size_t i = 0; i < cpu_tensors.size(); ++i) {
cpu_tensors[i].mutable_data<float>(dims, paddle::platform::CPUPlace());
}
paddle::framework::Tensor a;
paddle::framework::Tensor b;
paddle::framework::Tensor c;
paddle::framework::Tensor d;
paddle::framework::Tensor e;
paddle::framework::Tensor f;
paddle::framework::Tensor g;
paddle::framework::Tensor h;
paddle::framework::Tensor o;
float* a_data = a.mutable_data<float>(dims, place);
float* b_data = b.mutable_data<float>(dims, place);
float* c_data = c.mutable_data<float>(dims, place);
float* d_data = d.mutable_data<float>(dims, place);
float* e_data = e.mutable_data<float>(dims, place);
float* f_data = f.mutable_data<float>(dims, place);
float* g_data = g.mutable_data<float>(dims, place);
float* h_data = h.mutable_data<float>(dims, place);
float* o_data = o.mutable_data<float>(dims, place);
TensorCopySync(cpu_a, place, &a);
TensorCopySync(cpu_b, place, &b);
TensorCopySync(cpu_d, place, &d);
TensorCopySync(cpu_f, place, &f);
code.Compile();
std::vector<void*> args = {&n, &a_data, &b_data, &d_data, &f_data,
&c_data, &e_data, &g_data, &h_data, &o_data};
code.SetNumThreads(1024);
code.SetWorkloadPerThread(1);
code.Launch(n, &args);
TensorCopySync(o, paddle::platform::CPUPlace(), &cpu_o);
for (size_t i = 0; i < n; i++) {
float result =
(1.0 / (1.0 + std::exp(-std::max(
0.0, static_cast<float>(i) * 0.5 + 10.0 - 0.0))));
PADDLE_ENFORCE_EQ(cpu_o.data<float>()[i], result);
int n = cpu_tensors[0].numel();
TestMain("fused_elementwise_grad_0", expressions, cpu_tensors, n, input_ids,
output_ids);
auto cpu_kernel_handler = [&](float* var0, float* var1, float* var2,
float* var3, float* var7,
int i) -> std::vector<float> {
float var6_i = var2[i] > 0 ? var7[i] : 0;
float var4_i = var6_i * var1[i];
float var5_i = var6_i * var0[i];
return std::vector<float>{var4_i, var5_i, var6_i};
};
// Check the results
for (int i = 0; i < n; i++) {
std::vector<float> results = cpu_kernel_handler(
cpu_tensors[0].data<float>(), cpu_tensors[1].data<float>(),
cpu_tensors[2].data<float>(), cpu_tensors[3].data<float>(),
cpu_tensors[7].data<float>(), i);
PADDLE_ENFORCE_LT(fabs(cpu_tensors[4].data<float>()[i] - results[0]),
1.E-05);
PADDLE_ENFORCE_LT(fabs(cpu_tensors[5].data<float>()[i] - results[1]),
1.E-05);
PADDLE_ENFORCE_LT(fabs(cpu_tensors[6].data<float>()[i] - results[2]),
1.E-05);
}
}
#endif
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
......@@ -20,12 +21,22 @@ namespace framework {
namespace ir {
namespace fusion_group {
static std::unordered_set<std::string> binary_op_types = {
"elementwise_add", "elementwise_sub", "elementwise_mul",
"elementwise_div", "elementwise_min", "elementwise_max"};
static std::unordered_set<std::string> binary_op_types;
static std::unordered_set<std::string> unary_op_types;
static std::unordered_set<std::string> unary_op_types = {"relu", "sigmoid",
"tanh"};
static std::unordered_set<std::string>& GetBinaryOpTypes() {
if (binary_op_types.empty()) {
binary_op_types = OperationMap::Instance().Find(0, 2);
}
return binary_op_types;
}
static std::unordered_set<std::string>& GetUnaryOpTypes() {
if (unary_op_types.empty()) {
unary_op_types = OperationMap::Instance().Find(0, 1);
}
return unary_op_types;
}
static bool IsSpecifiedOp(const std::unordered_set<std::string>& op_types,
Node* n) {
......@@ -39,7 +50,7 @@ static bool IsSpecifiedOp(const std::unordered_set<std::string>& op_types,
}
static bool IsBinaryOp(Node* n) {
if (IsSpecifiedOp(binary_op_types, n) && n->inputs.size() == 2U) {
if (IsSpecifiedOp(GetBinaryOpTypes(), n) && n->inputs.size() == 2U) {
auto* x = n->inputs[0];
auto* y = n->inputs[1];
......@@ -64,7 +75,7 @@ static bool IsBinaryOp(Node* n) {
return false;
}
static bool IsUnaryOp(Node* n) { return IsSpecifiedOp(unary_op_types, n); }
static bool IsUnaryOp(Node* n) { return IsSpecifiedOp(GetUnaryOpTypes(), n); }
bool ElementwiseGroupDetector::IsElementwiseOp(Node* n) {
return IsBinaryOp(n) || IsUnaryOp(n);
......
......@@ -25,7 +25,7 @@ namespace framework {
namespace ir {
namespace fusion_group {
struct ElementwiseGroupDetector {
class ElementwiseGroupDetector {
public:
int operator()(Node* n);
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
......@@ -22,6 +23,8 @@ namespace framework {
namespace ir {
TEST(FusionGroupPass, elementwise_list) {
fusion_group::OperationMap::Init();
// inputs operator output
// --------------------------------------------------------
// (x, y) mul -> tmp_0
......@@ -69,6 +72,8 @@ TEST(FusionGroupPass, elementwise_list) {
}
TEST(FusionGroupPass, elementwise_tree) {
fusion_group::OperationMap::Init();
// inputs operator output
// --------------------------------------------------------
// (x0, y0) mul -> tmp_0
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
namespace paddle {
namespace framework {
namespace ir {
namespace fusion_group {
OperationMap* OperationMap::map = nullptr;
OperationMap::OperationMap() {
InsertUnaryElementwiseOperations();
InsertBinaryElementwiseOperations();
}
std::unordered_set<std::string> OperationMap::Find(int type, int num_operands) {
std::unordered_set<std::string> res;
for (auto& t : operations_) {
if ((t.second.type == type) &&
(num_operands < 0 || t.second.num_operands == num_operands)) {
res.insert(t.first);
}
}
return res;
}
void OperationMap::Insert(int type, int num_operands, std::string op_type,
std::string expr,
std::vector<std::string> grad_exprs) {
Operation op(type, num_operands, op_type, {expr});
PADDLE_ENFORCE_EQ(op.IsValid(), true, "Operation %s is invalid.", op_type);
operations_[op_type] = op;
if (grad_exprs.size() > 0U) {
std::string grad_op_type = op_type + "_grad";
Operation grad_op(type, num_operands, grad_op_type, grad_exprs);
PADDLE_ENFORCE_EQ(grad_op.IsValid(), true, "Operation %s is invalid.",
grad_op_type);
operations_[grad_op_type] = grad_op;
}
}
void OperationMap::InsertUnaryElementwiseOperations() {
int type = 0;
int num_oprands = 1;
// For unary elementwise operations:
// ${0} - x
// ${1} - out
// ${2} - dout
// relu:
// out = f(x) = x > 0 ? x : 0
// dx = dout * (out > 0 ? 1 : 0) = dout * (x > 0 ? 1 : 0)
Insert(type, num_oprands, "relu", "real_max(${0}, 0)",
{"${0} > 0 ? ${2} : 0"});
// sigmoid:
// out = f(x) = 1.0 / (1.0 + exp(-x))
// dx = dout * out * (1 - out)
Insert(type, num_oprands, "sigmoid", "1.0 / (1.0 + real_exp(- ${0}))",
{"${2} * ${1} * (1.0 - ${1})"});
// tanh:
// out = f(x) = 2.0 / (1.0 + exp(-2.0 * x)) - 1.0;
// dx = dout * (1 - out * out)
Insert(type, num_oprands, "tanh", "2.0 / (1.0 + real_exp(-2.0 * ${0})) - 1.0",
{"${2} * (1.0 - ${1} * ${1})"});
}
void OperationMap::InsertBinaryElementwiseOperations() {
int type = 0;
int num_oprands = 2;
// For binary elementwise operations:
// ${0} - x
// ${1} - y
// ${2} - out
// ${3} - dout
// elementwise_add:
// out = x + y
// dx = dout * 1
// dy = dout * 1
Insert(type, num_oprands, "elementwise_add", "${0} + ${1}", {"${3}", "${3}"});
// elementwise_sub:
// out = x - y
// dx = dout * 1
// dy = dout * (-1)
Insert(type, num_oprands, "elementwise_sub", "${0} - ${1}",
{"${3}", "- ${3}"});
// elementwise_mul:
// out = x * y
// dx = dout * y
// dy = dout * x
Insert(type, num_oprands, "elementwise_mul", "${0} * ${1}",
{"${3} * ${1}", "${3} * ${0}"});
Insert(type, num_oprands, "elementwise_div", "${0} / ${1}", {});
Insert(type, num_oprands, "elementwise_min", "real_min(${0}, ${1})", {});
Insert(type, num_oprands, "elementwise_max", "real_max(${0}, ${1})", {});
}
} // namespace fusion_group
} // namespace ir
} // namespace framework
} // namespace paddle
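
As a concrete illustration of how the grad expressions above are instantiated, the elementwise_mul_grad entry, used in the grad unittest with input ids {0, 1, 2, 6} and output ids {4, 5}, is expanded by GetLHS/GetRHS into roughly the following statements:

// Placeholder binding for this instance: ${0}=var0 (x), ${1}=var1 (y),
// ${2}=var2 (out), ${3}=var6 (dout); outputs are var4 (dx) and var5 (dy).
var4[idx] = var6[idx] * var1[idx];  // dx = dout * y
var5[idx] = var6[idx] * var0[idx];  // dy = dout * x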
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
namespace fusion_group {
struct Operation {
Operation() {}
Operation(int t, int n, std::string o, std::vector<std::string> e)
: type(t), num_operands(n), op_type(o), exprs(e) {}
bool IsGradOp() {
std::string suffix = "_grad";
return op_type.rfind(suffix) == (op_type.length() - suffix.length());
}
bool IsValid() {
if (!IsGradOp() && exprs.size() != 1U) {
return false;
}
if (IsGradOp() && exprs.size() != static_cast<size_t>(num_operands)) {
return false;
}
return true;
}
int type;
int num_operands;
std::string op_type;
std::vector<std::string> exprs;
};
class OperationMap {
public:
OperationMap();
static OperationMap& Instance() {
PADDLE_ENFORCE_NOT_NULL(map, "Need to initialize OperationMap first!");
return *map;
}
static OperationMap& Init() {
if (map == nullptr) {
map = new OperationMap();
}
return *map;
}
std::unordered_set<std::string> Find(int type, int num_operands = -1);
bool Has(std::string op_type) {
return operations_.find(op_type) != operations_.end();
}
Operation& Get(std::string op_type) {
auto iter = operations_.find(op_type);
PADDLE_ENFORCE_NE(iter, operations_.end(),
"Operation %s is not supported yet.", op_type);
return iter->second;
}
private:
void Insert(int type, int num_operands, std::string op_type, std::string expr,
std::vector<std::string> grad_exprs);
void InsertUnaryElementwiseOperations();
void InsertBinaryElementwiseOperations();
private:
static OperationMap* map;
std::unordered_map<std::string, Operation> operations_;
DISABLE_COPY_AND_ASSIGN(OperationMap);
};
} // namespace fusion_group
} // namespace ir
} // namespace framework
} // namespace paddle