diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index 4bb36263319cd805daf2d1f6c757b0db52d72ffe..05b0c1cc34e4302315779b035130f4fa019805e3 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library(code_generator SRCS operation.cc code_generator.cc code_generator_helper.cc DEPS graph) if(NOT APPLE AND NOT WIN32) if(WITH_GPU) - cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor) + cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass) endif() endif() diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc index 19d2303a42e00d35d68d5652aca8610ea256c483..b2a6aec0ad282d50c504f79067e4ceb4372d5a2a 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/code_generator.h" -#include #include +#include #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h" +#include "paddle/fluid/framework/ir/fusion_group/operation.h" namespace paddle { namespace framework { @@ -30,69 +31,205 @@ CodeGenerator::CodeGenerator() { code_templates_[0] = elementwise_t; } +std::string CodeGenerator::Generate(SubGraph* subgraph) { + std::vector expressions = ConvertToExpressions(subgraph); + return Generate(subgraph->func_name, expressions); +} + +std::vector CodeGenerator::ConvertToExpressions( + SubGraph* subgraph) { + std::unordered_map var_ids = EncodeVarNodes(subgraph); + std::vector expressions; + for (auto* node : subgraph->SortedNodes()) { + if (node && node->IsOp() && node->Op()) { + auto* op = node->Op(); + + // Input ids should be set in fixed order, like: + // - x, y in forward operations + // - x, y, out, out@GRAD in backward operations + std::vector input_ids; + std::vector input_names = + OperationMap::Instance().Get(op->Type()).input_names; + for (auto& name : input_names) { + // TODO(liuyiqun): support duplicated input. + if (op->Input(name).size() >= 1U) { + // Some input vars are not used in grad ops, such as + // "elementwise_add_grad", where "X", "Y" and "Out" are not used. + PADDLE_ENFORCE_NE(var_ids.find(op->Input(name)[0]), var_ids.end(), + "Input(%s) of operation %s should be set.", name, + op->Type()); + input_ids.push_back(var_ids[op->Input(name)[0]]); + } else { + input_ids.push_back(-1); + } + } + // Output ids should be set in fixed order, like: + // - dx, dy in backward operations + std::vector output_ids; + std::vector output_names = + OperationMap::Instance().Get(op->Type()).output_names; + for (auto& name : output_names) { + PADDLE_ENFORCE_EQ(op->Output(name).size(), 1U, + "Output(%s) of operation %s should be set.", name, + op->Type()); + PADDLE_ENFORCE_NE(var_ids.find(op->Output(name)[0]), var_ids.end(), + "Output(%s) of operation %s should be set.", name, + op->Type()); + output_ids.push_back(var_ids[op->Output(name)[0]]); + } + expressions.push_back( + OperationExpression(node->Name(), input_ids, output_ids)); + } + } + return expressions; +} + // In order to get the right result of expression, we need to calculate and // store the expression as suffix Expressions using vector. 
-std::string CodeGenerator::GenerateCode(
+std::string CodeGenerator::Generate(
     std::string func_name, std::vector<OperationExpression> expressions) {
-  // Check whether all expressions are elementwise operations.
+  // TODO(liuyiqun): Check whether all expressions are elementwise operations.
+  std::string dtype = "float";
+  std::set<int> input_ids = DistilInputIds(expressions);
+  std::set<int> output_ids = DistilOutputIds(expressions);
+
   TemplateVariable template_var;
   template_var.Add("func_name", func_name);
-  template_var.Add("parameters", EmitParameters(expressions, "float"));
-  template_var.Add("compute_body", EmitComputeBody(expressions));
+  template_var.Add("parameters", EmitParameters(input_ids, output_ids, dtype));
+  template_var.Add("compute_body",
+                   EmitComputeBody(expressions, input_ids, output_ids, dtype));
   return predefined_cuda_functions + code_templates_[0].Format(template_var);
 }
 
-// we get the parameter list code for the expression information
-std::string CodeGenerator::EmitParameters(
-    std::vector<OperationExpression> expressions, std::string dtype) {
+std::set<int> CodeGenerator::DistilInputIds(
+    const std::vector<OperationExpression>& expressions) {
   std::set<int> input_ids;
-  std::set<int> output_ids;
-  // Remove the reptead id and get a ordered list.
+  // Use std::set to remove the repeated ids and get an ordered list.
   for (size_t i = 0; i < expressions.size(); i++) {
     for (auto id : expressions[i].GetInputIds()) {
-      input_ids.insert(id);
+      if (id >= 0) {
+        input_ids.insert(id);
+      }
     }
+  }
+  return input_ids;
+}
+
+std::set<int> CodeGenerator::DistilOutputIds(
+    const std::vector<OperationExpression>& expressions) {
+  std::set<int> output_ids;
+  // Use std::set to remove the repeated ids and get an ordered list.
+  for (size_t i = 0; i < expressions.size(); i++) {
     for (auto id : expressions[i].GetOutputIds()) {
       output_ids.insert(id);
     }
   }
+  return output_ids;
+}
+
+// we get the parameter list code for the expression information
+std::string CodeGenerator::EmitParameters(const std::set<int>& input_ids,
+                                          const std::set<int>& output_ids,
+                                          std::string dtype) {
+  std::stringstream ret;
+  ret << "int N, ";
 
   // If a id is in the input and output list at the same time, then remove it
   // from the input list.
- for (auto iter = input_ids.begin(); iter != input_ids.end();) { - if (output_ids.find(*iter) != output_ids.end()) { - input_ids.erase(iter++); - } else { - iter++; + for (auto id : input_ids) { + if (output_ids.find(id) == output_ids.end()) { + ret << dtype << "* " << ArgName(id) << ", "; } } - std::stringstream ret; - ret << "int N, "; - for (auto iter = input_ids.begin(); iter != input_ids.end(); iter++) { - ret << dtype << "* " << VarName(*iter) << ", "; - } - - size_t count_index = 0; - for (auto iter = output_ids.begin(); iter != output_ids.end(); iter++) { - ret << dtype << "* " << VarName(*iter); - if (count_index != output_ids.size() - 1) { + size_t index = 0; + for (auto id : output_ids) { + ret << dtype << "* " << ArgName(id); + if (index != output_ids.size() - 1) { ret << ", "; } - count_index++; + index++; } return ret.str(); } std::string CodeGenerator::EmitComputeBody( - std::vector expressions) { - // get the right experssion code using suffix expression - std::stringstream ret; + const std::vector& expressions, + const std::set& input_ids, const std::set& output_ids, + std::string dtype) { + std::ostringstream compute; + std::unordered_set used; for (size_t i = 0; i < expressions.size(); i++) { - ret << expressions[i].GetExpression(); + VLOG(3) << DebugString(expressions[i]); + compute << expressions[i].GetExpression(dtype, &used); } - return ret.str(); + + // Load input to temporal variables. + std::ostringstream load; + for (auto id : input_ids) { + if (output_ids.find(id) == output_ids.end() && + used.find(id) != used.end()) { + load << dtype << " " << TmpName(id) << " = " << ArgName(id) << "[idx];"; + } + } + + // Store temporal variables to memory. + std::ostringstream store; + for (auto id : output_ids) { + store << ArgName(id) << "[idx] = " << TmpName(id) << ";"; + } + + return load.str() + compute.str() + store.str(); +} + +std::unordered_map CodeGenerator::EncodeVarNodes( + SubGraph* subgraph) { + const auto& input_var_nodes = subgraph->GetInputVarNodes(); + const auto& output_var_nodes = subgraph->GetOutputVarNodes(); + + int id = 0; + std::unordered_map var_ids; + // Numbering input vars. + for (auto* in : input_var_nodes) { + VLOG(3) << "Encoding input names:" << in->Name() << ", id:" << id; + if (var_ids.find(in->Name()) == var_ids.end()) { + var_ids[in->Name()] = id++; + } + } + // Numbering internal vars. + for (auto* node : subgraph->SortedNodes()) { + if (node && node->IsVar() && node->Var()) { + bool is_found = false; + for (auto* in : input_var_nodes) { + if (node == in) { + is_found = true; + break; + } + } + if (is_found) { + continue; + } + for (auto* out : output_var_nodes) { + if (node == out) { + is_found = true; + break; + } + } + PADDLE_ENFORCE_EQ( + is_found, true, + "Subgraph with internal var nodes (%s) is not supported yet.", + node->Name()); + } + } + // Encoding output vars. + for (auto* out : output_var_nodes) { + VLOG(3) << "Ecoding output names:" << out->Name() << ", id:" << id; + if (var_ids.find(out->Name()) == var_ids.end()) { + var_ids[out->Name()] = id++; + } + } + return var_ids; } } // namespace fusion_group diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.h b/paddle/fluid/framework/ir/fusion_group/code_generator.h index 6505e669e14343ae00f1759cf1e898e1472aef59..22d666111827d0d143dab640c82b6dbe9399b4a1 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.h +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.h @@ -14,9 +14,12 @@ limitations under the License. 
*/ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h" +#include "paddle/fluid/framework/ir/fusion_group/subgraph.h" namespace paddle { namespace framework { @@ -27,18 +30,31 @@ class CodeGenerator { public: CodeGenerator(); - std::string GenerateCode(std::string func_name, - std::vector expressions); + std::string Generate(std::string func_name, + std::vector expressions); - // TODO(wangchao): add a more general interface - // std::string Generate(const std::string name, const SubGraph& subgraph); + std::string Generate(SubGraph* subgraph); + + std::vector ConvertToExpressions(SubGraph* subgraph); private: + std::set DistilInputIds( + const std::vector& expressions); + std::set DistilOutputIds( + const std::vector& expressions); + // we get the parameter list code for the expression information - std::string EmitParameters(std::vector expressions, + std::string EmitParameters(const std::set& input_ids, + const std::set& output_ids, std::string dtype); - std::string EmitComputeBody(std::vector expressions); + std::string EmitComputeBody( + const std::vector& expressions, + const std::set& input_ids, const std::set& output_ids, + std::string dtype); + + // Encode all var nodes in the subgraph with an unique number. + std::unordered_map EncodeVarNodes(SubGraph* subgraph); private: std::vector code_templates_; diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc index fd4b31cf9f13748f0293dc56f114ec984e9cb4bb..38d5ebde87e3208ae3111dd591db9d5a76de4d84 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc @@ -33,8 +33,9 @@ static T StringTo(const std::string& str) { return value; } -std::string OperationExpression::GetRHS(size_t i) { - auto rhs = OperationMap::Instance().Get(op_).exprs[i]; +std::string OperationExpression::GetRHS(std::unordered_set* used, + size_t i) const { + auto rhs = OperationMap::Instance().Get(op_type_).exprs[i]; for (size_t i = 0; i < rhs.size(); i++) { size_t pos = i; if (rhs[pos] == '$' && rhs[pos + 1] == '{') { @@ -47,29 +48,33 @@ std::string OperationExpression::GetRHS(size_t i) { PADDLE_ENFORCE_LT(index, input_ids_.size(), "Only %d inputs are provided, but need %d.", input_ids_.size(), index + 1); - rhs.replace(pos, length + 3, VarName(input_ids_[index]) + R"([idx])"); + PADDLE_ENFORCE_GE(input_ids_[index], 0, + "Input id should be no less than 0."); + rhs.replace(pos, length + 3, TmpName(input_ids_[index])); + used->insert(input_ids_[index]); } } return rhs; } -std::string OperationExpression::GetLHS(size_t i) { +std::string OperationExpression::GetLHS(size_t i) const { std::stringstream ret; - ret << VarName(output_ids_[i]) << R"([idx])"; + ret << TmpName(output_ids_[i]); return ret.str(); } -bool OperationExpression::IsSupport() { - return OperationMap::Instance().Has(op_); +bool OperationExpression::IsSupport() const { + return OperationMap::Instance().Has(op_type_); } // we Traverse the graph and get the group , all input id and output id is // unique for the node which belong the group -std::string OperationExpression::GetExpression() { +std::string OperationExpression::GetExpression( + std::string dtype, std::unordered_set* used) const { std::stringstream ret; if (IsSupport()) { for (size_t i = 0; i < output_ids_.size(); ++i) { - ret << GetLHS(i) << " = " << GetRHS(i) << ";"; + ret << dtype << " " << 
GetLHS(i) << " = " << GetRHS(used, i) << ";"; } } diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h index a3df4937f2d4f8116e4d75a13c05ffdbf60ce5d7..ed4ce665d2b6ba43d17538c60e3517064e7430e4 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h @@ -14,10 +14,10 @@ limitations under the License. */ #pragma once -#include #include #include #include +#include #include #include "paddle/fluid/platform/enforce.h" @@ -27,28 +27,36 @@ namespace framework { namespace ir { namespace fusion_group { -static std::string VarName(int index) { return "var" + std::to_string(index); } +static inline std::string ArgName(int index) { + return "arg" + std::to_string(index); +} +static inline std::string TmpName(int index) { + return "tmp" + std::to_string(index); +} class OperationExpression { public: - explicit OperationExpression(std::string op, std::vector input_ids, + explicit OperationExpression(std::string op_type, std::vector input_ids, std::vector output_ids) - : op_(op), input_ids_(input_ids), output_ids_(output_ids) {} + : op_type_(op_type), input_ids_(input_ids), output_ids_(output_ids) {} - std::vector GetInputIds() { return input_ids_; } - std::vector GetOutputIds() { return output_ids_; } + std::string GetOpType() const { return op_type_; } + std::vector GetInputIds() const { return input_ids_; } + std::vector GetOutputIds() const { return output_ids_; } // Check whether this operation type is supported in OperationMap. - bool IsSupport(); + bool IsSupport() const; - std::string GetExpression(); + std::string GetExpression(std::string dtype, + std::unordered_set* used) const; + private: // TODO(wangchao): make offset more flexible we add stride and basic offset - std::string GetRHS(size_t i = 0); - std::string GetLHS(size_t i = 0); + std::string GetRHS(std::unordered_set* used, size_t i = 0) const; + std::string GetLHS(size_t i = 0) const; private: - std::string op_; + std::string op_type_; std::vector input_ids_; std::vector output_ids_; }; @@ -58,6 +66,7 @@ class TemplateVariable { void Add(std::string identifier, std::string expression) { strings_[identifier] = expression; } + void Remove(std::string identifier, std::string expression) { for (auto it = strings_.begin(); it != strings_.end();) { if (it->first == identifier) { @@ -155,7 +164,6 @@ __device__ double real_max(double x, double y) { return ::fmax(x, y); } )"; static const char elementwise_cuda_template[] = R"( - extern "C" __global__ void $func_name($parameters) { for(int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; @@ -165,6 +173,28 @@ extern "C" __global__ void $func_name($parameters) { } )"; +static std::string DebugString(const OperationExpression& expr) { + std::stringstream ret; + ret << "Op(" << expr.GetOpType() << "), inputs:{"; + auto input_ids = expr.GetInputIds(); + for (size_t i = 0; i < input_ids.size(); ++i) { + if (i != 0) { + ret << ","; + } + ret << expr.GetInputIds()[i]; + } + ret << "}, outputs:{"; + auto output_ids = expr.GetOutputIds(); + for (size_t i = 0; i < output_ids.size(); ++i) { + if (i != 0) { + ret << ","; + } + ret << expr.GetOutputIds()[i]; + } + ret << "}"; + return ret.str(); +} + } // namespace fusion_group } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 
dbe9af3cb723352f6d0c261af8f04ca6c60df0cc..bc9f2408b67cd1b1d47470795a5cc82e025c46c5 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -18,16 +18,133 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/ir/fusion_group/operation.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device_code.h" #include "paddle/fluid/platform/init.h" #ifdef PADDLE_WITH_CUDA -namespace fusion_group = paddle::framework::ir::fusion_group; + +namespace paddle { +namespace framework { +namespace ir { +namespace fusion_group { + +// relu +inline float relu(float x) { return x > 0 ? x : 0.; } + +inline float relu_grad_dx(float x, float out, float dout) { + return x > 0 ? dout : 0; +} + +// sigmoid +inline float sigmoid(float x) { return 1.0 / (1.0 + std::exp(-x)); } + +inline float sigmoid_grad_dx(float x, float out, float dout) { + return dout * out * (1 - out); +} + +// tanh +inline float tanh(float x) { return 2.0 / (1.0 + std::exp(-2 * x)) - 1.0; } + +inline float tanh_grad_dx(float x, float out, float dout) { + return dout * (1.0 - out * out); +} + +// elementwise_add +inline float elementwise_add(float x, float y) { return x + y; } + +inline float elementwise_add_grad_dx(float x, float y, float out, float dout) { + return dout; +} + +inline float elementwise_add_grad_dy(float x, float y, float out, float dout) { + return dout; +} + +// elementwise_sub +inline float elementwise_sub(float x, float y) { return x - y; } + +inline float elementwise_sub_grad_dx(float x, float y, float out, float dout) { + return dout; +} + +inline float elementwise_sub_grad_dy(float x, float y, float out, float dout) { + return -dout; +} + +// elementwise_mul +inline float elementwise_mul(float x, float y) { return x * y; } + +inline float elementwise_mul_grad_dx(float x, float y, float out, float dout) { + return dout * y; +} + +inline float elementwise_mul_grad_dy(float x, float y, float out, float dout) { + return dout * x; +} + +void CheckOutput(const std::vector& expressions, + const std::vector cpu_tensors, + const std::vector input_ids_of_subgraph, + const std::vector output_ids_of_subgraph, int i) { + std::vector var(cpu_tensors.size()); + for (auto id : input_ids_of_subgraph) { + if (id >= 0) { + var[id] = cpu_tensors[id].data()[i]; + } + } + + for (auto expression : expressions) { + std::string op_type = expression.GetOpType(); + auto input_ids = expression.GetInputIds(); + auto output_ids = expression.GetOutputIds(); + if (op_type == "relu") { + var[output_ids[0]] = relu(var[input_ids[0]]); + } else if (op_type == "sigmoid") { + var[output_ids[0]] = sigmoid(var[input_ids[0]]); + } else if (op_type == "tanh") { + var[output_ids[0]] = tanh(var[input_ids[0]]); + } else if (op_type == "elementwise_add") { + var[output_ids[0]] = + elementwise_add(var[input_ids[0]], var[input_ids[1]]); + } else if (op_type == "elementwise_sub") { + var[output_ids[0]] = + elementwise_sub(var[input_ids[0]], var[input_ids[1]]); + } else if (op_type == "elementwise_mul") { + var[output_ids[0]] = + elementwise_mul(var[input_ids[0]], var[input_ids[1]]); + } else if (op_type == "relu_grad") { + var[output_ids[0]] = + relu_grad_dx(var[input_ids[0]], 0, var[input_ids[2]]); + } else if (op_type == "sigmoid_grad") { + var[output_ids[0]] = + sigmoid_grad_dx(0, var[input_ids[1]], 
var[input_ids[2]]); + } else if (op_type == "tanh_grad") { + var[output_ids[0]] = + tanh_grad_dx(0, var[input_ids[1]], var[input_ids[2]]); + } else if (op_type == "elementwise_add_grad") { + var[output_ids[0]] = elementwise_add_grad_dx(0, 0, 0, var[input_ids[3]]); + var[output_ids[1]] = elementwise_add_grad_dy(0, 0, 0, var[input_ids[3]]); + } else if (op_type == "elementwise_mul_grad") { + var[output_ids[0]] = + elementwise_mul_grad_dx(0, var[input_ids[1]], 0, var[input_ids[3]]); + var[output_ids[1]] = + elementwise_mul_grad_dy(var[input_ids[0]], 0, 0, var[input_ids[3]]); + } + } + + for (auto id : output_ids_of_subgraph) { + float actual = cpu_tensors[id].data()[i]; + float expect = var[id]; + PADDLE_ENFORCE_LT(fabs(actual - expect), 1.E-05, + "Get %f vs %f (actual vs expect).", actual, expect); + } +} template -void SetupRandomCPUTensor(paddle::framework::LoDTensor* tensor) { +void SetupRandomCPUTensor(LoDTensor* tensor) { static unsigned int seed = 100; std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); @@ -40,15 +157,16 @@ void SetupRandomCPUTensor(paddle::framework::LoDTensor* tensor) { } } -void TestMain(std::string func_name, - std::vector expressions, - std::vector cpu_tensors, int n, - std::vector input_ids, std::vector output_ids) { - fusion_group::OperationMap::Init(); - fusion_group::CodeGenerator code_generator; - std::string code_str = code_generator.GenerateCode(func_name, expressions); - VLOG(3) << code_str; +} // namespace fusion_group +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace fusion_group = paddle::framework::ir::fusion_group; +void TestMainImpl(std::string func_name, std::string code_str, + std::vector cpu_tensors, int n, + std::vector input_ids, std::vector output_ids) { paddle::framework::InitDevices(false, {0}); paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceCode device_code(place, func_name, code_str); @@ -60,20 +178,20 @@ void TestMain(std::string func_name, std::vector args; args.push_back(&n); - for (size_t i = 0; i < input_ids.size(); ++i) { - gpu_ptrs[input_ids[i]] = gpu_tensors[input_ids[i]].mutable_data( - cpu_tensors[input_ids[i]].dims(), place); - args.push_back(&gpu_ptrs[input_ids[i]]); - - SetupRandomCPUTensor(&cpu_tensors[input_ids[i]]); - TensorCopySync(cpu_tensors[input_ids[i]], place, - &gpu_tensors[input_ids[i]]); + for (auto id : input_ids) { + if (id >= 0) { + gpu_ptrs[id] = + gpu_tensors[id].mutable_data(cpu_tensors[id].dims(), place); + fusion_group::SetupRandomCPUTensor(&cpu_tensors[id]); + TensorCopySync(cpu_tensors[id], place, &gpu_tensors[id]); + args.push_back(&gpu_ptrs[id]); + } } - for (size_t i = 0; i < output_ids.size(); ++i) { - gpu_ptrs[output_ids[i]] = gpu_tensors[output_ids[i]].mutable_data( - cpu_tensors[output_ids[i]].dims(), place); - args.push_back(&gpu_ptrs[output_ids[i]]); + for (auto id : output_ids) { + gpu_ptrs[id] = + gpu_tensors[id].mutable_data(cpu_tensors[id].dims(), place); + args.push_back(&gpu_ptrs[id]); } device_code.SetNumThreads(1024); @@ -84,12 +202,40 @@ void TestMain(std::string func_name, paddle::platform::DeviceContextPool::Instance().Get(place)); dev_ctx->Wait(); - for (size_t i = 0; i < output_ids.size(); ++i) { - TensorCopySync(gpu_tensors[output_ids[i]], paddle::platform::CPUPlace(), - &cpu_tensors[output_ids[i]]); + for (auto id : output_ids) { + TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(), + &cpu_tensors[id]); } } +void TestMain(std::string func_name, + std::vector 
expressions, + std::vector cpu_tensors, int n, + std::vector input_ids, std::vector output_ids) { + fusion_group::OperationMap::Init(); + fusion_group::CodeGenerator code_generator; + std::string code_str = code_generator.Generate(func_name, expressions); + VLOG(3) << code_str; + + TestMainImpl(func_name, code_str, cpu_tensors, n, input_ids, output_ids); +} + +std::vector TestMain( + fusion_group::SubGraph* subgraph, + std::vector cpu_tensors, int n, + std::vector input_ids, std::vector output_ids) { + fusion_group::OperationMap::Init(); + fusion_group::CodeGenerator code_generator; + std::string code_str = code_generator.Generate(subgraph); + VLOG(3) << code_str; + + TestMainImpl(subgraph->func_name, code_str, cpu_tensors, n, input_ids, + output_ids); + + // Need to check the accuracy according to expressions. + return code_generator.ConvertToExpressions(subgraph); +} + TEST(code_generator, elementwise) { // t2 = t0 * t1 // t4 = t2 + t3 @@ -101,41 +247,33 @@ TEST(code_generator, elementwise) { fusion_group::OperationExpression exp3("elementwise_sub", {4, 5}, {6}); fusion_group::OperationExpression exp4("relu", {6}, {7}); fusion_group::OperationExpression exp5("sigmoid", {7}, {8}); - std::vector expressions = { exp1, exp2, exp3, exp4, exp5}; // Prepare CPU tensors std::vector cpu_tensors(9); - std::vector input_ids = {0, 1, 3, 5}; - std::vector output_ids = {2, 4, 6, 7, 8}; - auto dims = paddle::framework::make_ddim( {static_cast(256), static_cast(1024)}); for (size_t i = 0; i < cpu_tensors.size(); ++i) { cpu_tensors[i].mutable_data(dims, paddle::platform::CPUPlace()); } + // Expressions: + // Op(elementwise_mul), inputs:{0,1}, outputs:{2} + // Op(elementwise_add), inputs:{2,3}, outputs:{4} + // Op(elementwise_sub), inputs:{4,5}, outputs:{6} + // Op(relu), inputs:{6}, outputs:{7} + // Op(sigmoid), inputs:{7}, outputs:{8} int n = cpu_tensors[0].numel(); - TestMain("fused_elementwise_0", expressions, cpu_tensors, n, input_ids, + std::vector input_ids = {0, 1, 3, 5}; + std::vector output_ids = {2, 4, 6, 7, 8}; + TestMain("elementwise_kernel_0", expressions, cpu_tensors, n, input_ids, output_ids); - auto cpu_kernel_handler = [&](float* var0, float* var1, float* var3, - float* var5, int i) -> float { - float var2_i = var0[i] * var1[i]; - float var4_i = var2_i + var3[i]; - float var6_i = var4_i - var5[i]; - float var7_i = var6_i > 0.0 ? 
var6_i : 0.0; - float var8_i = 1.0 / (1.0 + std::exp(-var7_i)); - return var8_i; - }; - // Check the results for (int i = 0; i < n; i++) { - float result = cpu_kernel_handler( - cpu_tensors[0].data(), cpu_tensors[1].data(), - cpu_tensors[3].data(), cpu_tensors[5].data(), i); - PADDLE_ENFORCE_LT(fabs(cpu_tensors[8].data()[i] - result), 1.E-05); + fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids, + i); } } @@ -145,48 +283,183 @@ TEST(code_generator, elementwise_grad) { // t3 = relu(t2) // t2' = relu_grad(t2, t3, t3') // t0', t1' = elementwise_mul_grad(t0, t1, t2, t2') - fusion_group::OperationExpression exp1("relu_grad", {2, 3, 7}, {6}); + fusion_group::OperationExpression exp1("relu_grad", {2, -1, 7}, {6}); fusion_group::OperationExpression exp2("elementwise_mul_grad", {0, 1, 2, 6}, {4, 5}); - std::vector expressions = {exp1, exp2}; // Prepare CPU tensors std::vector cpu_tensors(8); - std::vector input_ids = {0, 1, 2, 3, 7}; - std::vector output_ids = {4, 5, 6}; - auto dims = paddle::framework::make_ddim( {static_cast(256), static_cast(1024)}); for (size_t i = 0; i < cpu_tensors.size(); ++i) { cpu_tensors[i].mutable_data(dims, paddle::platform::CPUPlace()); } + // Expressions: + // Op(relu_grad), inputs:{2,3,7}, outputs:{6} + // Op(elementwise_mul_grad), inputs:{0,1,2,6}, outputs:{4,5} int n = cpu_tensors[0].numel(); - TestMain("fused_elementwise_grad_0", expressions, cpu_tensors, n, input_ids, + std::vector input_ids = {0, 1, 2, -1, 7}; + std::vector output_ids = {4, 5, 6}; + TestMain("elementwise_grad_kernel_0", expressions, cpu_tensors, n, input_ids, output_ids); - auto cpu_kernel_handler = [&](float* var0, float* var1, float* var2, - float* var3, float* var7, - int i) -> std::vector { - float var6_i = var2[i] > 0 ? var7[i] : 0; - float var4_i = var6_i * var1[i]; - float var5_i = var6_i * var0[i]; - return std::vector{var4_i, var5_i, var6_i}; + // Check the results + for (int i = 0; i < n; i++) { + fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids, + i); + } +} + +std::unique_ptr BuildGraph( + bool backward = false) { + // inputs operator output + // -------------------------------------------------------- + // x0 sigmoid -> tmp_0 + // (tmp_0, x1) elementwise_mul -> tmp_1 + // x2 tanh -> tmp_2 + // (x3, tmp_2) elementwise_mul -> tmp_3 + // (tmp_1, tmp_3) elementwise_add -> tmp_4 + // + // Expression: tmp_4 = sigmoid(x0) * x1 + tanh(x2) * x3 + // The var order (their ids may be different): + // backward is false - x0(0), x1(1), x2(2), x3(3); + // - tmp_0(4), tmp_2(5), tmp_3(6), tmp_1(7), tmp_4(8) + // backward is true - tmp_1(0), tmp_4@GRAD(1), tmp_3(2), tmp_4(3), + // tmp_2(4), x3(5), x1(6), tmp_0(7), x0(8), x2(9) + // - tmp_3@GRAD(10), tmp_1@GRAD(11), tmp_0@GRAD(12), + // tmp_2@GRAD(13), x2@GRAD(14), x0@GRAD(15), + // x3@GRAD(16), x1@GRAD(17) + paddle::framework::ir::Layers layers; + auto* x0 = layers.data("x0", {16, 32}); + auto* tmp_0 = layers.sigmoid(x0); + tmp_0->SetShape({16, 32}); + auto* x1 = layers.data("x1", {16, 32}); + auto* tmp_1 = layers.elementwise_mul(tmp_0, x1); + tmp_1->SetShape({16, 32}); + auto* x2 = layers.data("x2", {16, 32}); + auto* tmp_2 = layers.tanh(x2); + tmp_2->SetShape({16, 32}); + auto* x3 = layers.data("x3", {16, 32}); + auto* tmp_3 = layers.elementwise_mul(x3, tmp_2); + tmp_3->SetShape({16, 32}); + layers.elementwise_add(tmp_1, tmp_3); + + if (backward) { + layers.backward(); + } + + std::unique_ptr graph( + new paddle::framework::ir::Graph(layers.main_program())); +#ifdef __clang__ + return graph; +#else + 
return std::move(graph); +#endif +} + +std::unordered_set DistilGradNodes( + const std::unique_ptr& graph) { + auto is_grad_op = [&](paddle::framework::ir::Node* n) -> bool { + if (n && n->IsOp() && n->Op()) { + std::string suffix = "_grad"; + std::string op_type = n->Op()->Type(); + size_t pos = op_type.rfind(suffix); + return pos != std::string::npos && + pos == (op_type.length() - suffix.length()); + } + return false; }; + std::unordered_set grad_nodes; + for (auto* n : graph->Nodes()) { + if (is_grad_op(n)) { + grad_nodes.insert(n); + } else if (n && n->IsVar() && n->Var()) { + // Remove forward op nodes from inputs + std::vector inputs; + for (auto* in : n->inputs) { + if (in && in->IsOp() && in->Op() && is_grad_op(in)) { + inputs.push_back(in); + } + } + n->inputs = inputs; + // Remove forward op nodes from outputs + std::vector outputs; + for (auto* out : n->outputs) { + if (out && out->IsOp() && out->Op() && is_grad_op(out)) { + outputs.push_back(out); + } + } + n->outputs = outputs; + grad_nodes.insert(n); + } + } + return grad_nodes; +} + +TEST(code_generator, subgraph) { + std::unique_ptr graph = BuildGraph(false); + fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true, + graph->Nodes()); + + // Prepare CPU tensors + std::vector cpu_tensors(9); + auto dims = paddle::framework::make_ddim( + {static_cast(256), static_cast(1024)}); + for (size_t i = 0; i < cpu_tensors.size(); ++i) { + cpu_tensors[i].mutable_data(dims, paddle::platform::CPUPlace()); + } + + // Expressions generated by code_generator (they may be different): + // Op(sigmoid), inputs:{0}, outputs:{4} + // Op(elementwise_mul), inputs:{4,1}, outputs:{7} + // Op(tanh), inputs:{2}, outputs:{5} + // Op(elementwise_mul), inputs:{3,5}, outputs:{6} + // Op(elementwise_add), inputs:{7,6}, outputs:{8} + int n = cpu_tensors[0].numel(); + std::vector input_ids = {0, 1, 2, 3}; + std::vector output_ids = {4, 5, 6, 7, 8}; + std::vector expressions = + TestMain(&subgraph, cpu_tensors, n, input_ids, output_ids); + + // Check the results + for (int i = 0; i < n; i++) { + fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids, + i); + } +} + +TEST(code_generator, subgraph_grad) { + std::unique_ptr graph = BuildGraph(true); + fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true, + DistilGradNodes(graph)); + + // Prepare CPU tensors + std::vector cpu_tensors(18); + auto dims = paddle::framework::make_ddim( + {static_cast(256), static_cast(1024)}); + for (size_t i = 0; i < cpu_tensors.size(); ++i) { + cpu_tensors[i].mutable_data(dims, paddle::platform::CPUPlace()); + } + + // Expressions generated by code_generator (they may be different): + // Op(elementwise_add_grad), inputs:{1,2,3,0}, outputs:{11,10} + // Op(elementwise_mul_grad), inputs:{5,4,2,10}, outputs:{17,13} + // Op(elementwise_mul_grad), inputs:{7,6,1,11}, outputs:{12,15} + // Op(sigmoid_grad), inputs:{8,7,12}, outputs:{16} + // Op(tanh_grad), inputs:{9,4,13}, outputs:{14} + int n = cpu_tensors[0].numel(); + std::vector input_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector output_ids = {10, 11, 12, 13, 14, 15, 16, 17}; + std::vector expressions = + TestMain(&subgraph, cpu_tensors, n, input_ids, output_ids); + // Check the results for (int i = 0; i < n; i++) { - std::vector results = cpu_kernel_handler( - cpu_tensors[0].data(), cpu_tensors[1].data(), - cpu_tensors[2].data(), cpu_tensors[3].data(), - cpu_tensors[7].data(), i); - PADDLE_ENFORCE_LT(fabs(cpu_tensors[4].data()[i] - results[0]), - 1.E-05); - 
PADDLE_ENFORCE_LT(fabs(cpu_tensors[5].data()[i] - results[1]), - 1.E-05); - PADDLE_ENFORCE_LT(fabs(cpu_tensors[6].data()[i] - results[2]), - 1.E-05); + fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids, + i); } } #endif diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc index c9e999085696197fded51c9a6fd8a10db08a54b9..fb6f7b8d746505c0f52298ece980fffe880b1cb8 100644 --- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc +++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc @@ -108,13 +108,6 @@ bool ElementwiseGroupDetector::IsOutputOfElementwiseOp(Node* n) { return false; } -void ElementwiseGroupDetector::Insert(Node* n) { - if (subgraph_.nodes_set.find(n) == subgraph_.nodes_set.end()) { - VLOG(5) << "Insert " << n->Name() << " to subgraph " << name_; - subgraph_.nodes_set.insert(n); - } -} - int ElementwiseGroupDetector::Search(Node* n, std::vector except_nodes) { std::unordered_set except_nodes_set; for (size_t i = 0; i < except_nodes.size(); ++i) { @@ -123,16 +116,16 @@ int ElementwiseGroupDetector::Search(Node* n, std::vector except_nodes) { int num_operations = 0; if (IsElementwiseOp(n)) { - Insert(n); + subgraph_.Insert(n); num_operations += 1; for (auto* var : n->inputs) { - Insert(var); + subgraph_.Insert(var); if (except_nodes_set.find(var) == except_nodes_set.end()) { num_operations += Search(var, {n}); } } for (auto* var : n->outputs) { - Insert(var); + subgraph_.Insert(var); if (except_nodes_set.find(var) == except_nodes_set.end()) { num_operations += Search(var, {n}); } @@ -157,7 +150,7 @@ int ElementwiseGroupDetector::Search(Node* n, std::vector except_nodes) { int ElementwiseGroupDetector::operator()(Node* n) { if (!IsOutputOfElementwiseOp(n) && IsInputOfElementwiseOp(n, "X")) { name_ = n->Name(); - Insert(n); + subgraph_.Insert(n); num_operations_ = Search(n, n->inputs); VLOG(4) << "Detect elementwise subgraph begin with " << name_ << ", " << num_operations_ << " operations, " << GetSubgraph().GetNumNodes() diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h index 33cf982321ed7996de78be1992ed75407e403a52..49d472eaab870015e2c69b021d3b1f99ff1f8774 100644 --- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h +++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h @@ -36,7 +36,6 @@ class ElementwiseGroupDetector { bool IsInputOfElementwiseOp(Node* n, std::string name = ""); bool IsOutputOfElementwiseOp(Node* n); - void Insert(Node* n); int Search(Node* n, std::vector except_nodes = {}); private: diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc index 9f7dd15f62d8102c04f034eecd1c8ac71782d236..1f06e46c4744c3f8cf545b0baca8b228beb07f04 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc @@ -36,7 +36,7 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const { for (Node* n : all_nodes) { bool is_found = false; for (auto& subgraph : subgraphs) { - if (subgraph.nodes_set.find(n) != subgraph.nodes_set.end()) { + if (subgraph.Has(n)) { is_found = true; break; } @@ -61,15 +61,17 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const { // TODO(liuyiqun): check whether there are intersection 
between subgraphs for (size_t i = 0; i < subgraphs.size(); ++i) { - InsertFusionGroupOp(graph, subgraphs[i]); + InsertFusionGroupOp(graph, &subgraphs[i]); } return subgraphs.size(); } void FusionGroupPass::InsertFusionGroupOp( - Graph* graph, const fusion_group::SubGraph& subgraph) const { - std::vector input_vars_of_subgraph = subgraph.GetInputVarNodes(); - std::vector output_vars_of_subgraph = subgraph.GetOutputVarNodes(); + Graph* graph, fusion_group::SubGraph* subgraph) const { + const std::vector& input_vars_of_subgraph = + subgraph->GetInputVarNodes(); + const std::vector& output_vars_of_subgraph = + subgraph->GetOutputVarNodes(); std::unordered_set external_nodes; OpDesc op_desc; @@ -88,8 +90,8 @@ void FusionGroupPass::InsertFusionGroupOp( external_nodes.insert(n); } op_desc.SetOutput("Outs", output_names); - op_desc.SetAttr("type", subgraph.type); - op_desc.SetAttr("func_name", subgraph.func_name); + op_desc.SetAttr("type", subgraph->type); + op_desc.SetAttr("func_name", subgraph->func_name); auto fusion_group_node = graph->CreateOpNode(&op_desc); for (auto* in : input_vars_of_subgraph) { @@ -100,7 +102,7 @@ void FusionGroupPass::InsertFusionGroupOp( } std::unordered_set internal_nodes; - for (auto* n : subgraph.nodes_set) { + for (auto* n : subgraph->Nodes()) { if (external_nodes.find(n) == external_nodes.end()) { internal_nodes.insert(n); } diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h index c61db8f9ea0a060d1116da09ae16eba7bfa44561..8bdddf8877c06d15bd3ce23d6dd60af4189f40f6 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h @@ -30,7 +30,7 @@ class FusionGroupPass : public Pass { private: int DetectFusionGroup(Graph* graph, int type = 0) const; void InsertFusionGroupOp(Graph* graph, - const fusion_group::SubGraph& subgraph) const; + fusion_group::SubGraph* subgraph) const; const std::string name_scope_{"fusion_group"}; }; diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc index 0da8022089b79e0f264fc50cacc16331c183a7ec..a023897d121ec3df019c8f796fa98b56f5d69b43 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc @@ -22,6 +22,14 @@ namespace paddle { namespace framework { namespace ir { +void VisualizeGraph(std::unique_ptr graph, std::string graph_viz_path) { + // Insert a graph_viz_pass to transform the graph to a .dot file. + // It can be used for debug. + auto graph_viz_pass = PassRegistry::Instance().Get("graph_viz_pass"); + graph_viz_pass->Set("graph_viz_path", new std::string(graph_viz_path)); + graph.reset(graph_viz_pass->Apply(graph.release())); +} + TEST(FusionGroupPass, elementwise_list) { fusion_group::OperationMap::Init(); @@ -46,29 +54,17 @@ TEST(FusionGroupPass, elementwise_list) { layers.elementwise_add(tmp_2, w); std::unique_ptr graph(new Graph(layers.main_program())); - - // The following codes is to insert a graph_viz_pass to transform the graph to - // a .dot file. It is used for debug. 
- // auto graph_viz_pass = PassRegistry::Instance().Get("graph_viz_pass"); - // graph_viz_pass->Set("graph_viz_path", new - // std::string("00_elementwise_list.dot")); - // graph.reset(graph_viz_pass->Apply(graph.release())); + // VisualizeGraph(graph, "00_elementwise_list.dot"); auto fusion_group_pass = PassRegistry::Instance().Get("fusion_group_pass"); VLOG(3) << DebugString(graph); graph.reset(fusion_group_pass->Apply(graph.release())); + // VisualizeGraph(graph, "01_elementwise_list.fusion_group.dot"); int num_fusion_group_ops = GetNumOpNodes(graph, "fusion_group"); VLOG(3) << DebugString(graph); PADDLE_ENFORCE_EQ(num_fusion_group_ops, 1); - - // The following codes is to insert a graph_viz_pass to transform the graph to - // a .dot file. It is used for debug. - // auto graph_viz_pass = PassRegistry::Instance().Get("graph_viz_pass"); - // graph_viz_pass->Set("graph_viz_path", new - // std::string("01_elementwise_list.fusion_group.dot")); - // graph.reset(graph_viz_pass->Apply(graph.release())); } TEST(FusionGroupPass, elementwise_tree) { @@ -128,29 +124,17 @@ TEST(FusionGroupPass, elementwise_tree) { layers.mul(tmp_6, tmp_9); std::unique_ptr graph(new Graph(layers.main_program())); - - // The following codes is to insert a graph_viz_pass to transform the graph to - // a .dot file. It is used for debug. - // auto graph_viz_pass = PassRegistry::Instance().Get("graph_viz_pass"); - // graph_viz_pass->Set("graph_viz_path", new - // std::string("00_elementwise_tree.dot")); - // graph.reset(graph_viz_pass->Apply(graph.release())); + // VisualizeGraph(graph, "00_elementwise_tree.dot"); auto fusion_group_pass = PassRegistry::Instance().Get("fusion_group_pass"); - LOG(INFO) << DebugString(graph); + VLOG(3) << DebugString(graph); graph.reset(fusion_group_pass->Apply(graph.release())); + // VisualizeGraph(graph, "01_elementwise_tree.fusion_group.dot"); int num_fusion_group_ops = GetNumOpNodes(graph, "fusion_group"); - LOG(INFO) << DebugString(graph); + VLOG(3) << DebugString(graph); PADDLE_ENFORCE_EQ(num_fusion_group_ops, 2); - - // The following codes is to insert a graph_viz_pass to transform the graph to - // a .dot file. It is used for debug. - // auto graph_viz_pass = PassRegistry::Instance().Get("graph_viz_pass"); - // graph_viz_pass->Set("graph_viz_path", new - // std::string("01_elementwise_tree.fusion_group.dot")); - // graph.reset(graph_viz_pass->Apply(graph.release())); } } // namespace ir diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc index 41344dbaa76c73b0b72b578b48640fcb4e4bc735..ad89cc13cb2a1c65b2005f530454c0aab2bfa467 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.cc +++ b/paddle/fluid/framework/ir/fusion_group/operation.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/fusion_group/operation.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace framework { @@ -38,15 +39,30 @@ std::unordered_set OperationMap::Find(int type, int num_operands) { } void OperationMap::Insert(int type, int num_operands, std::string op_type, - std::string expr, - std::vector grad_exprs) { - Operation op(type, num_operands, op_type, {expr}); + std::string expr, std::vector grad_exprs, + std::vector input_names, + std::vector output_names) { + Operation op(type, num_operands, op_type, {expr}, input_names, output_names); PADDLE_ENFORCE_EQ(op.IsValid(), true, "Operation %s is invalid.", op_type); operations_[op_type] = op; if (grad_exprs.size() > 0U) { std::string grad_op_type = op_type + "_grad"; - Operation grad_op(type, num_operands, grad_op_type, grad_exprs); + // grad_inputs = inputs + outputs + grad of outputs + std::vector grad_input_names = input_names; + for (auto name : output_names) { + grad_input_names.push_back(name); + } + for (auto name : output_names) { + grad_input_names.push_back(GradVarName(name)); + } + // grad_output = grad of inputs + std::vector grad_output_names; + for (auto name : input_names) { + grad_output_names.push_back(GradVarName(name)); + } + Operation grad_op(type, num_operands, grad_op_type, grad_exprs, + grad_input_names, grad_output_names); PADDLE_ENFORCE_EQ(grad_op.IsValid(), true, "Operation %s is invalid.", grad_op_type); operations_[grad_op_type] = grad_op; @@ -54,59 +70,65 @@ void OperationMap::Insert(int type, int num_operands, std::string op_type, } void OperationMap::InsertUnaryElementwiseOperations() { - int type = 0; - int num_oprands = 1; // For unary elementwise operations: // ${0} - x // ${1} - out // ${2} - dout + auto insert_handler = [&](std::string op_type, std::string expr, + std::vector grad_exprs) { + int type = 0; + int num_oprands = 1; + Insert(type, num_oprands, op_type, expr, grad_exprs, {"X"}, {"Out"}); + }; // relu: // out = f(x) = x > 0 ? x : 0 // dx = dout * (out > 0 ? 1 : 0) = dout * (x > 0 ? 1 : 0) - Insert(type, num_oprands, "relu", "real_max(${0}, 0)", - {"${0} > 0 ? ${2} : 0"}); + insert_handler("relu", "real_max(${0}, 0)", {"${0} > 0 ? 
${2} : 0"}); // sigmoid: // out = f(x) = 1.0 / (1.0 + exp(-x)) // dx = dout * out * (1 - out) - Insert(type, num_oprands, "sigmoid", "1.0 / (1.0 + real_exp(- ${0}))", - {"${2} * ${1} * (1.0 - ${1})"}); + insert_handler("sigmoid", "1.0 / (1.0 + real_exp(- ${0}))", + {"${2} * ${1} * (1.0 - ${1})"}); // tanh: // out = f(x) = 2.0 / (1.0 + exp(-2.0 * x)) - 1.0; // dx = dout * (1 - out * out) - Insert(type, num_oprands, "tanh", "2.0 / (1.0 + real_exp(-2.0 * ${0})) - 1.0", - {"${2} * (1.0 - ${1} * ${1})"}); + insert_handler("tanh", "2.0 / (1.0 + real_exp(-2.0 * ${0})) - 1.0", + {"${2} * (1.0 - ${1} * ${1})"}); } void OperationMap::InsertBinaryElementwiseOperations() { - int type = 0; - int num_oprands = 2; // For binary elementwise oprations: // ${0} - x // ${1} - y // ${2} - out // ${3} - dout + auto insert_handler = [&](std::string op_type, std::string expr, + std::vector grad_exprs) { + int type = 0; + int num_oprands = 2; + Insert(type, num_oprands, op_type, expr, grad_exprs, {"X", "Y"}, {"Out"}); + }; // elementwise_add: // out = x + y // dx = dout * 1 // dy = dout * 1 - Insert(type, num_oprands, "elementwise_add", "${0} + ${1}", {"${3}", "${3}"}); + insert_handler("elementwise_add", "${0} + ${1}", {"${3}", "${3}"}); // elementwise_sub: // out = x - y // dx = dout * 1 // dy = dout * (-1) - Insert(type, num_oprands, "elementwise_sub", "${0} - ${1}", - {"${3}", "- ${3}"}); + insert_handler("elementwise_sub", "${0} - ${1}", {"${3}", "- ${3}"}); // elementwise_mul: // out = x * y // dx = dout * y // dy = dout * x - Insert(type, num_oprands, "elementwise_mul", "${0} * ${1}", - {"${3} * ${1}", "${3} * ${0}"}); - Insert(type, num_oprands, "elementwise_div", "${0} / ${1}", {}); - Insert(type, num_oprands, "elementwise_min", "real_min(${0}, ${1})", {}); - Insert(type, num_oprands, "elementwise_max", "real_max(${0}, ${1})", {}); + insert_handler("elementwise_mul", "${0} * ${1}", + {"${3} * ${1}", "${3} * ${0}"}); + insert_handler("elementwise_div", "${0} / ${1}", {}); + insert_handler("elementwise_min", "real_min(${0}, ${1})", {}); + insert_handler("elementwise_max", "real_max(${0}, ${1})", {}); } } // namespace fusion_group diff --git a/paddle/fluid/framework/ir/fusion_group/operation.h b/paddle/fluid/framework/ir/fusion_group/operation.h index cd738aa777fde0636ee601ce2f319dbe1c10f5fc..3145e3052efb6b5bbd0fc25e75f77e9590b9b146 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.h +++ b/paddle/fluid/framework/ir/fusion_group/operation.h @@ -26,20 +26,32 @@ namespace ir { namespace fusion_group { struct Operation { - Operation() {} - Operation(int t, int n, std::string o, std::vector e) - : type(t), num_operands(n), op_type(o), exprs(e) {} + Operation() = default; + Operation(int t, int n, std::string o, std::vector e, + std::vector i_n, std::vector o_n) + : type(t), + num_operands(n), + op_type(o), + exprs(e), + input_names(i_n), + output_names(o_n) {} bool IsGradOp() { std::string suffix = "_grad"; - return op_type.rfind(suffix) == (op_type.length() - suffix.length()); + size_t pos = op_type.rfind(suffix); + return pos != std::string::npos && + pos == (op_type.length() - suffix.length()); } bool IsValid() { if (!IsGradOp() && exprs.size() != 1U) { + // When it is a forward operation, it should hold only one expression (for + // only one output). return false; } if (IsGradOp() && exprs.size() != static_cast(num_operands)) { + // When it is a backward opertion, it should hold a expression for each + // operand. 
return false; } return true; @@ -49,6 +61,8 @@ struct Operation { int num_operands; std::string op_type; std::vector exprs; + std::vector input_names; + std::vector output_names; }; class OperationMap { @@ -83,7 +97,9 @@ class OperationMap { private: void Insert(int type, int num_operands, std::string op_type, std::string expr, - std::vector grad_exprs); + std::vector grad_exprs, + std::vector input_names, + std::vector output_names); void InsertUnaryElementwiseOperations(); void InsertBinaryElementwiseOperations(); diff --git a/paddle/fluid/framework/ir/fusion_group/subgraph.h b/paddle/fluid/framework/ir/fusion_group/subgraph.h index a1f06d55410fb872d1e04439137e74489ca72281..10f2df77462d08e91b8626daf3b61a27a8747d95 100644 --- a/paddle/fluid/framework/ir/fusion_group/subgraph.h +++ b/paddle/fluid/framework/ir/fusion_group/subgraph.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include "paddle/fluid/framework/ir/fusion_group/operation.h" #include "paddle/fluid/framework/ir/node.h" namespace paddle { @@ -27,12 +29,35 @@ namespace fusion_group { struct SubGraph { int type{-1}; std::string func_name; - std::unordered_set nodes_set; + bool save_intermediate_out{false}; + + SubGraph() = default; + SubGraph(int t, std::string f, bool s, const std::unordered_set& n) + : type(t), func_name(f), save_intermediate_out(s), nodes_set(n) {} bool IsEmpty() { return nodes_set.empty(); } + const std::unordered_set& Nodes() const { return nodes_set; } + + const std::vector& SortedNodes() { + if (!is_sorted) { + Sort(); + } + return sorted_nodes; + } + size_t GetNumNodes() { return nodes_set.size(); } + bool Has(Node* n) { return nodes_set.find(n) != nodes_set.end(); } + + void Insert(Node* n) { + if (nodes_set.find(n) == nodes_set.end()) { + VLOG(5) << "Insert " << n->Name() << " to subgraph " << this; + nodes_set.insert(n); + is_sorted = false; + } + } + int GetNumOperations() { int num_operations = 0; for (auto* n : nodes_set) { @@ -43,11 +68,10 @@ struct SubGraph { return num_operations; } - std::vector GetInputVarNodes() const { - // The order of input nodes should be consistent with that of the generated - // code. + std::vector GetInputVarNodes() { + // The order of input nodes should be consistent anywhere. std::vector input_vars; - for (auto* n : nodes_set) { + for (auto* n : SortedNodes()) { if (n && n->IsVar() && n->Var()) { bool is_found = true; // When the inputs size is 0, it is also considered the input var of @@ -57,7 +81,7 @@ struct SubGraph { } // Normally a var node has only one input op node. for (auto* in : n->inputs) { - if (nodes_set.find(in) == nodes_set.end()) { + if (!Has(in)) { is_found = false; } } @@ -69,28 +93,197 @@ struct SubGraph { return input_vars; } - std::vector GetOutputVarNodes() const { - // The order of output nodes should be consistant with that of the generated - // code. + std::vector GetOutputVarNodes() { + // The order of output nodes should be consistant anywhere.. std::vector output_vars; + for (auto* n : SortedNodes()) { + if (n && n->IsVar() && n->Var()) { + if (save_intermediate_out) { + // If the var_node is the output of some op_node in the subgraph, it + // is considered the output var node of the subgraph. 
+ bool is_found = false; + for (auto* in : n->inputs) { + if (Has(in)) { + is_found = true; + } + } + if (is_found) { + output_vars.push_back(n); + } + } else { + // If one of the var_node's outputs is the input of some operator + // outside the subgraph, it is considered the output var node of the + // subgraph. + bool is_found = true; + if (n->outputs.size() == 0U) { + is_found = false; + } + for (auto* out : n->outputs) { + if (!Has(out)) { + is_found = false; + } + } + if (!is_found) { + output_vars.push_back(n); + } + } + } + } + return output_vars; + } + + private: + int FindIndexInSortedNodes(Node* n) { + for (size_t i = 0; i < sorted_nodes.size(); ++i) { + if (n == sorted_nodes[i]) { + return static_cast(i); + } + } + return -1; + } + + void SortVarsBasedOnSortedOps() { + // Insert var nodes to sorted_nodes. + std::unordered_map sorted_vars; for (auto* n : nodes_set) { if (n && n->IsVar() && n->Var()) { - bool is_found = true; - if (n->outputs.size() == 0U) { - is_found = false; + int from = 0; + int to = sorted_nodes.size(); + + for (auto* in : n->inputs) { + if (in && in->IsOp() && in->Op()) { + int index = FindIndexInSortedNodes(in); + // Insert after input op node + if (index >= 0) { + from = index + 1 > from ? index + 1 : from; + } + } } + for (auto* out : n->outputs) { - if (nodes_set.find(out) == nodes_set.end()) { - is_found = false; + if (out && out->IsOp() && out->Op()) { + int index = FindIndexInSortedNodes(out); + // Insert before output op node + if (index >= 0) { + to = index < to ? index : to; + } } } - if (!is_found) { - output_vars.push_back(n); + + PADDLE_ENFORCE_LE(from, to, "Range [%d, %d] is invalid.", from, to); + sorted_nodes.insert(sorted_nodes.begin() + to, n); + sorted_vars[n->Name()] = n; + } + } + } + + std::vector SortedOps() { + Node* start_op_n = nullptr; + std::unordered_set ops; + for (auto* op_n : nodes_set) { + if (op_n && op_n->IsOp() && op_n->Op()) { + // Initialize ops to all ops in the subgraph. + ops.insert(op_n); + + if (!start_op_n) { + // Find start op node whose inputs are produced outside the subgraph. + bool is_found = false; + for (auto* prev_op_n : GetPrevOpNodes(op_n)) { + if (Has(prev_op_n)) { + is_found = true; + break; + } + } + if (!is_found) { + start_op_n = op_n; + } } } } - return output_vars; + + std::vector sorted_ops; + sorted_ops.push_back(start_op_n); + ops.erase(start_op_n); + while (ops.size() > 0U) { + std::unordered_set erased_ops; + for (auto* op_n : ops) { + bool found_connected_ops = false; + int from = 1; + int to = sorted_ops.size(); + std::unordered_set prev_op_nodes = GetPrevOpNodes(op_n); + std::unordered_set next_op_nodes = GetNextOpNodes(op_n); + for (int i = sorted_ops.size(); i >= 0; --i) { + if (prev_op_nodes.find(sorted_ops[i]) != prev_op_nodes.end()) { + // Insert after i (i + 1) + found_connected_ops = true; + from = (i + 1 > from) ? i + 1 : from; + } + if (next_op_nodes.find(sorted_ops[i]) != next_op_nodes.end()) { + // Insert before i + found_connected_ops = true; + to = (i < to) ? 
i : to; + } + } + if (found_connected_ops) { + PADDLE_ENFORCE_LE(from, to, "Range [%d, %d] is invalid.", from, to); + sorted_ops.insert(sorted_ops.begin() + to, op_n); + erased_ops.insert(op_n); + } + } + PADDLE_ENFORCE_GT(erased_ops.size(), 0U); + for (auto* op_n : erased_ops) { + ops.erase(op_n); + } + } + return sorted_ops; + } + + std::unordered_set GetPrevOpNodes(Node* op_n) { + PADDLE_ENFORCE_EQ(op_n && op_n->IsOp() && op_n->Op(), true, + "Node %p is not a op node.", op_n); + + std::unordered_set prev_op_nodes; + for (auto* in_var : op_n->inputs) { + if (in_var && in_var->IsVar() && in_var->Var()) { + for (auto* prev_op_n : in_var->inputs) { + if (prev_op_n && prev_op_n->IsOp() && prev_op_n->Op()) { + prev_op_nodes.insert(prev_op_n); + } + } + } + } + return prev_op_nodes; + } + + std::unordered_set GetNextOpNodes(Node* op_n) { + PADDLE_ENFORCE_EQ(op_n && op_n->IsOp() && op_n->Op(), true, + "Node %p is not a op node.", op_n); + + std::unordered_set next_op_nodes; + for (auto* out_var : op_n->outputs) { + if (out_var && out_var->IsVar() && out_var->Var()) { + for (auto* next_op_n : out_var->outputs) { + if (next_op_n && next_op_n->IsOp() && next_op_n->Op()) { + next_op_nodes.insert(next_op_n); + } + } + } + } + return next_op_nodes; } + + void Sort() { + if (!is_sorted) { + sorted_nodes = SortedOps(); + SortVarsBasedOnSortedOps(); + } + is_sorted = true; + } + + private: + std::unordered_set nodes_set; + bool is_sorted{false}; + std::vector sorted_nodes; }; } // namespace fusion_group diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h index 2709b6e2facc9b32f3ead1547b383000894247b2..a89f0c3b661db7d9b9794d6edb792654846d09f5 100644 --- a/paddle/fluid/framework/ir/pass_tester_helper.h +++ b/paddle/fluid/framework/ir/pass_tester_helper.h @@ -19,7 +19,10 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" namespace paddle { namespace framework { @@ -267,6 +270,47 @@ struct Layers { return outs; } + void backward() { + BlockDesc* block = program_.MutableBlock(0); + std::vector forward_ops = block->AllOps(); + for (int i = forward_ops.size() - 1; i >= 0; --i) { + OpDesc* op = forward_ops[i]; + OpDesc* grad_op = block->AppendOp(); + grad_op->SetType(op->Type() + "_grad"); + // All op's inputs are grad_op's input. + for (auto name : op->InputNames()) { + grad_op->SetInput(name, op->Input(name)); + } + // All op's outputs are grad_op's input. + for (auto name : op->OutputNames()) { + grad_op->SetInput(name, op->Output(name)); + } + // All op's outputs grad are grad_op's input. + for (auto name : op->OutputNames()) { + std::vector grad_var_names; + for (auto var_name : op->Output(name)) { + VarDesc* var = block->FindVar(var_name); + VarDesc* grad_var = + lod_tensor(GradVarName(var_name), var->GetShape(), false); + grad_var_names.push_back(grad_var->Name()); + } + grad_op->SetInput(GradVarName(name), grad_var_names); + } + // All op's inputs grad are grad_op's output. 
+ for (auto name : op->InputNames()) { + std::vector grad_var_names; + for (auto var_name : op->Input(name)) { + VarDesc* var = block->FindVar(var_name); + VarDesc* grad_var = + lod_tensor(GradVarName(var_name), var->GetShape(), false); + grad_var_names.push_back(grad_var->Name()); + } + grad_op->SetOutput(GradVarName(name), grad_var_names); + } + // TODO(liuyiqun): attrs + } + } + private: VarDesc* lod_tensor(std::string name, std::vector shape = {}, bool is_persistable = false) { @@ -412,7 +456,7 @@ static std::string DebugString(Node* node) { return os.str(); } -static std::string DebugString(const std::unordered_set& nodes) { +static std::string DebugString(const std::vector& nodes) { std::ostringstream os; for (auto* node : nodes) { if (node->IsOp() && node->Op()) { @@ -425,6 +469,14 @@ static std::string DebugString(const std::unordered_set& nodes) { return os.str(); } +static std::string DebugString(const std::unordered_set& nodes) { + std::vector vec; + for (auto* node : nodes) { + vec.push_back(node); + } + return DebugString(vec); +} + static std::string DebugString(const std::unique_ptr& graph) { std::ostringstream os; os << "Graph: {\n" << DebugString(graph->Nodes()) << "}\n";
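
Note (illustrative, not part of the patch): for the forward test case TEST(code_generator, elementwise) above, CodeGenerator::Generate should emit roughly the following CUDA kernel. Parameter names come from ArgName() ("arg" + id, pure inputs first, then outputs), per-thread temporaries from TmpName() ("tmp" + id), and the body is the load/compute/store concatenation produced by EmitComputeBody, wrapped in elementwise_cuda_template. The exact whitespace, the grid-stride increment and the added comments are assumptions based on this diff, not copied from generated output.

    extern "C" __global__ void elementwise_kernel_0(
        int N, float* arg0, float* arg1, float* arg3, float* arg5,
        float* arg2, float* arg4, float* arg6, float* arg7, float* arg8) {
      for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N;
           idx += gridDim.x * blockDim.x) {
        // Load: pure inputs (ids 0, 1, 3, 5) that some expression actually uses.
        float tmp0 = arg0[idx];
        float tmp1 = arg1[idx];
        float tmp3 = arg3[idx];
        float tmp5 = arg5[idx];
        // Compute: one "float tmpN = ..." per OperationExpression output.
        float tmp2 = tmp0 * tmp1;                     // elementwise_mul
        float tmp4 = tmp2 + tmp3;                     // elementwise_add
        float tmp6 = tmp4 - tmp5;                     // elementwise_sub
        float tmp7 = real_max(tmp6, 0);               // relu
        float tmp8 = 1.0 / (1.0 + real_exp(- tmp7));  // sigmoid
        // Store: every output id (2, 4, 6, 7, 8) is written back.
        arg2[idx] = tmp2;
        arg4[idx] = tmp4;
        arg6[idx] = tmp6;
        arg7[idx] = tmp7;
        arg8[idx] = tmp8;
      }
    }

real_max and real_exp come from predefined_cuda_functions, which is prepended to the kernel string before it is handed to platform::CUDADeviceCode for compilation in the tester.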
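Note (illustrative, not part of the patch): the operand numbering that OperationMap::Insert establishes for grad ops orders the inputs as X, Y, Out, Out@GRAD and the outputs as X@GRAD, Y@GRAD, so in the registered grad expressions ${0}=x, ${1}=y, ${2}=out and ${3}=dout. With the tester's exp2("elementwise_mul_grad", {0, 1, 2, 6}, {4, 5}), GetExpression would expand the registered exprs {"${3} * ${1}", "${3} * ${0}"} roughly into:

    // dx = dout * y;  dy = dout * x
    float tmp4 = tmp6 * tmp1;
    float tmp5 = tmp6 * tmp0;

Inputs flagged with id -1 (such as the unused Out operand of relu_grad in the updated test) are skipped by DistilInputIds, so no kernel argument or load is emitted for them.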