diff --git a/.gitignore b/.gitignore index 801790d0a472080af607e9fbcde0284902a4ead8..664c45b7202f6bf93712062ffa1d003b575afffd 100644 --- a/.gitignore +++ b/.gitignore @@ -52,12 +52,12 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. -paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/pd/ir/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json tools/infrt/kernel_signature.json -paddle/infrt/dialect/pd_ops_info.h +paddle/infrt/dialect/pd/common/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 3a2ec403c0a59aaa23decc72fb9581b5a7f78343..9c4089af092e418d6845864671124917c6498cf1 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" - +DECLARE_bool(retain_grad_for_all_tensor); namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, @@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } std::vector> GradNodeAccumulation:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -62,7 +62,7 @@ operator()( grad_out = grads[0][0]; } - if (!weak_grad_.expired()) { + if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) { auto grad = weak_grad_.lock(); CopyOrAddTensor(grad.get(), grad_out); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 07fa40165167ce2352018c0e1b1cb08222d5a181..a91a0b6e34c0d9440e3645d1a6982748c4315962 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } std::string name() { return "GradNodeAccumulation"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 5a2595b9103e4d49845fa8938ee3577b6b3f3f06..0bc998a03a80b7b8a1e486ad68f1575c130d2c1b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } std::vector> GradNodeScale:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { // 1. 
Check Output Size PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 247fde6ed1f869542969b068cdae9f59cedd732a..e263f73a6b8a4a1f9ce23d9b5ca383fd6828016b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } void SetTensorWrappers_X( const std::vector& tensors); diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index bf838b27615028167e35a8e85e7636dd4c834016..d9f201dc9f1e8b9a0296288917b82f3e2903330e 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2074,7 +2074,8 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "std::vector> " "GradNode%s::operator()(const " - "std::vector>& grads) {\n%s\n}"; + "std::vector>& grads, " + "bool create_graph) {\n%s\n}"; std::string grad_function_str = paddle::string::Sprintf( GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); @@ -2109,18 +2110,28 @@ static std::string GenerateGradNodeHeaderContents( "\n" " virtual std::vector> " "operator()(const " - "std::vector>& grads) " + "std::vector>& grads, const " + "bool create_graph = false) " "override;\n" "\n" + " void ClearTensorWrappers() override { \n" + "%s\n" + " is_tensor_wrappers_cleared = true;\n" + " }\n" " std::string name() override { return \" GradNode%s \"; } \n " "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" + " bool IsTensorWrappersCleared() override { \n" + " return is_tensor_wrappers_cleared;\n" + " }\n" " private:\n" " // TensorWrappers\n" "%s\n" + " bool is_tensor_wrappers_cleared = false;\n" + "\n" " // Attribute Map\n" "%s\n" "};"; @@ -2154,6 +2165,7 @@ static std::string GenerateGradNodeHeaderContents( std::string set_tensor_wrappers_str = ""; std::string tensor_wrapper_members_str = ""; + std::string clear_tensor_wrappers_str = ""; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -2185,6 +2197,13 @@ static std::string GenerateGradNodeHeaderContents( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, struct_tensor_wrapper_name); + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = + "for (auto tw: %s) {\n" + " tw.clear();\n" + " }\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); + } else { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; @@ -2197,10 +2216,14 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( 
SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, tensor_wrapper_name, full_reserved_str); + + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = @@ -2215,8 +2238,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, - op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, - tensor_wrapper_members_str, attr_members_str); + op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str, + set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d2d699e154f91d21e94e9d2dac3f703069d041e5..4c1e5b00cbaf6fd0688663c9dac756832e44dc4a 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -478,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, # SetTensorWrapper Methods & TensorWrapper Members set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" + clear_tensor_wrapper_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): if tname in no_need_buffer_set: no_need_buffer = "true" @@ -499,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + {}.clear(); +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + else: assert IsVectorTensorType(ttype) SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ @@ -516,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + for (auto tw: {}) { + tw.clear(); + }; +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + # End: SetTensorWrapper Methods & TensorWrapper Members # SetAttributes & Attribute Members @@ -524,7 +541,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, for aname, atype, default_val, _ in backward_attrs_list: saved_attr_name = GetSavedName(aname) SET_ATTR_METHOD_TEMPLATE = """ - void SetAttribute{}({} {}) {{ + void SetAttribute{}({} {}) {{ {} = {}; }} """ @@ -555,25 +572,37 @@ class {} : public egr::GradNodeBase {{ ~{}() override = default; virtual std::vector> operator()( - const std::vector>& grads) override; + const std::vector>& grads, bool create_graph = false) override; std::string name() override {{ return \" {} \"; }} + + void ClearTensorWrappers() override {{ + {} + is_tensor_wrappers_cleared = true; + }} + // SetTensorWrapperX, SetTensorWrapperY, ... 
{} // SetAttributes {} + + bool IsTensorWrappersCleared() override {{ + return is_tensor_wrappers_cleared; + }} private: // TensorWrappers {} + bool is_tensor_wrappers_cleared = false; + // Attributes {} }}; """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - grad_node_name, set_tensor_wrapper_methods_str, - set_attribute_methods_str, tensor_wrapper_members_str, - attribute_members_str) + grad_node_name, clear_tensor_wrapper_str, + set_tensor_wrapper_methods_str, set_attribute_methods_str, + tensor_wrapper_members_str, attribute_members_str) return node_declaration_str @@ -637,7 +666,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, grad_api_namespace = f"paddle::experimental" FUNCTION_TEMPLATE = """ -std::vector> {}::operator()(const std::vector>& grads) {{ +std::vector> {}::operator()(const std::vector>& grads, bool create_graph) {{ // Call grad_api function auto grad_api_returns = {}::{}({}); {} diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 1987d024d8f3e34121f54962c45f0f8c1e91b723..f2d5f338bd4af6ce1f9858d35485a0b23dc5f61c 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -39,12 +39,21 @@ std::unordered_map getInDegreeMap( // Copy nodes std::queue queue = init_queue; std::unordered_set visited; + size_t potential_startup_ops_cnt = queue.size(); + size_t cnt = 0; // Visit each node exactly once in any order while (!queue.empty()) { GradNodeBase* node = queue.front(); queue.pop(); + if (cnt < potential_startup_ops_cnt) { + if (!node_in_degree_map.count(node)) { + node_in_degree_map[node] = 0; + } + cnt += 1; + } + if (visited.count(node)) { continue; } @@ -76,23 +85,248 @@ std::unordered_map getInDegreeMap( return node_in_degree_map; } -void RunBackward(const std::vector& tensors, - const std::vector& grad_tensors, - bool retain_graph) { - paddle::platform::RecordEvent backward_record_event( - "backward", paddle::platform::TracerEventType::Operator, 1); +// Remove some nodes those doesn't need to be +// stored in potential_stop_nodes、potential_startup_nodes +void UpdateGraphInfo( + std::unordered_map* + target_nodes_inputmeta_map, + std::unordered_map>* + depending_nodes, + std::unordered_set* potential_stop_nodes, + std::unordered_set* potential_startup_nodes) { + // Updated potential_sotp_nodes by depending_nodes, + // make sure the path from root to target_node is ok + std::unordered_set _startup_ops; + VLOG(6) << "Running in UpdateGraphInfo"; + std::queue queue; + for (auto& target_nodes_inputmeta_pair : *target_nodes_inputmeta_map) { + queue.emplace(target_nodes_inputmeta_pair.first); + } + + while (!queue.empty()) { + auto* target_node = queue.front(); + queue.pop(); + if (!(*depending_nodes)[target_node].empty()) { + auto precedding_nodes = (*depending_nodes)[target_node]; + for (auto pre_nodes : precedding_nodes) { + queue.emplace(pre_nodes); + if (potential_stop_nodes->find(pre_nodes) != + potential_stop_nodes->end()) { + potential_stop_nodes->erase(pre_nodes); + } + } + } else { // startup_ops have no precedding nodes + VLOG(6) << "Emplace _startup_ops"; + _startup_ops.emplace(target_node); + } + } + // Purify potential_startup_nodes again, remove some + // potential startup_nodes that unreach to input target nodes + if (!_startup_ops.empty()) { + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto node : *potential_startup_nodes) { + if (_startup_ops.count(node) == 0) { + VLOG(6) << 
"Set up potential_startup_nodes_to_be_erased"; + potential_startup_nodes_to_be_erased.emplace(node); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto node : potential_startup_nodes_to_be_erased) { + VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; + potential_startup_nodes->erase(node); + } + } + } +} + +// Get Graph Info Betweent input target gradnode and outputs, +// record depending_nodes、 potential_stop_nodes、potential_startup_nodes +void GetGraphInfoBetweenTargets( + const std::queue& init_queue, + std::unordered_map* + input_target_nodes_inputmeta_map, + std::unordered_map>* + depending_nodes, + std::unordered_set* potential_stop_nodes, + std::unordered_set* potential_startup_nodes) { + if (input_target_nodes_inputmeta_map->empty()) return; + + VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; + + // Calculate in_degree for each node + std::unordered_map node_in_degree_map; + + // Copy nodes + std::queue queue = init_queue; + std::unordered_set visited; + + // Visit each node exactly once in any order + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + if (visited.count(node)) { + continue; + } + visited.insert(node); + + // Check node is target_nodes or not, if node is not target_node, + // all the next_node will be marked in potential_stop_nodes + bool is_potential_stop_nodes = + input_target_nodes_inputmeta_map->count(node); + + // Find and append next nodes + const std::vector>& edges = node->GetEdges(); + for (const auto& edge_list : edges) { + for (const Edge& edge : edge_list) { + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node) continue; + + // if node not in input_target_nodes, + // all the next_nodes of current node will be inserted to + // potential_stop_node + if (is_potential_stop_nodes) { + potential_stop_nodes->emplace(next_node); + } + + // Update in_degree + if (!node_in_degree_map.count(next_node)) + node_in_degree_map[next_node] = 0; + node_in_degree_map[next_node]++; + // Record depending relationship + (*depending_nodes)[next_node].emplace(node); + queue.push(next_node); + } + } + } + // Update Graph Info, remove some stop_node in potential_stop_nodes + UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes, + potential_stop_nodes, potential_startup_nodes); +} + +void GetTargetNodesInfo(const std::vector& inputs, + std::unordered_map* + target_nodes_inputmeta_map) { + VLOG(6) << "Running in GetTargetNodesInfo"; + if (!inputs.empty()) { + VLOG(6) << "Inputs are not empty"; + size_t num_inputs = inputs.size(); + for (size_t i = 0; i < num_inputs; i++) { + AutogradMeta* auto_grad_meta = + EagerUtils::unsafe_autograd_meta(inputs[i]); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + PADDLE_ENFORCE_NOT_NULL(target_node, + paddle::platform::errors::Fatal( + "There is no grad op for input:%d or it's" + "stop_gradient=True", + i)); + (*target_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } + } +} + +std::vector GetResults( + const std::vector& inputs, + std::unordered_map* + results_map, + bool allow_unused, bool create_graph) { + VLOG(6) << "Running in GetResults"; + if (inputs.empty()) return {}; + + std::vector results; + results.reserve(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto& input = inputs[i]; + AutogradMeta* auto_grad_meta = 
EagerUtils::unsafe_autograd_meta(input); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + auto iter = results_map->find(target_node); + if (iter != results_map->end()) { + // set StopGradient = !create_graph + AutogradMeta* tensor_auto_grad_meta = + EagerUtils::autograd_meta(&(iter->second)); + tensor_auto_grad_meta->SetStopGradient(!create_graph); + results.emplace_back(iter->second); + } else { + PADDLE_ENFORCE_EQ(allow_unused, true, + paddle::platform::errors::InvalidArgument( + "The %d-th input does not appear in the backward " + "graph. Please check the input variable or set " + "allow_unused=True to get None result.", + i)); + results.emplace_back(); + } + } + return results; +} + +// Enforce GradNode has TensorWrappers as Input +void EnforceGradNodeHasInput(GradNodeBase* node) { + VLOG(6) << "Running in EnforceGradNodeHasInput"; + PADDLE_ENFORCE_NE( + node->IsTensorWrappersCleared(), true, + paddle::platform::errors::Fatal( + "The TensorWrappers of %s do not exist. This may be because:\n" + "You calculate backward twice for the same subgraph without " + "setting retain_graph=True. Please set retain_graph=True in the " + "first backward/grad call.\n", + node->name())); +} + +// Purify potential_startup_nodes, remove nodes those are the same as +// input_target_nodes +void PurifyPotentialStartUpNodes( + std::unordered_set* potential_startup_nodes, + std::unordered_map* + input_target_nodes_inputmeta_map) { + VLOG(6) << "Running in PurifyPotentialStartUpNodes"; + if (input_target_nodes_inputmeta_map->empty()) return; + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto startup_op : *potential_startup_nodes) { + auto iter = input_target_nodes_inputmeta_map->find(startup_op); + if (iter != input_target_nodes_inputmeta_map->end()) { + potential_startup_nodes_to_be_erased.emplace(iter->first); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto nodes : potential_startup_nodes_to_be_erased) { + potential_startup_nodes->erase(nodes); + } + } +} + +std::vector RunBackward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph, bool create_graph = false, + const std::vector& inputs = {}, + bool allow_unused = false, + const std::vector& no_grad_vars = {}) { VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level // *Cross-batch accumulation happens at forward pass + std::unordered_map + no_grad_var_nodes_inputmeta_map; + // Get no_grad_vars's GradNodes and InputMeta Info + GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map); + /* --- Initialization --- */ // 1. Init queue with starting nodes // 2. 
Prepare initial input buffers std::queue queue; std::unordered_map> node_input_buffers_dict; + std::unordered_set potential_startup_nodes; for (size_t i = 0; i < tensors.size(); i++) { const paddle::experimental::Tensor& tensor = tensors[i]; @@ -132,8 +366,17 @@ void RunBackward(const std::vector& tensors, "size = 0 or same size as tensors")); // Feed given tensor if it's provided VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; - node_input_buffers_dict[grad_node]->add( - input_info.first, input_info.second, grad_tensors[i]); + + if (grad_tensors[i].is_initialized()) { + // Deep copy + paddle::experimental::Tensor tmp_tensor; + tmp_tensor.copy_(grad_tensors[i], true); + node_input_buffers_dict[grad_node]->add(input_info.first, + input_info.second, tmp_tensor); + } else { + node_input_buffers_dict[grad_node]->add( + input_info.first, input_info.second, grad_tensors[i]); + } } else { VLOG(6) << "Fill grad input tensor " << i << " with 1.0"; @@ -146,8 +389,9 @@ void RunBackward(const std::vector& tensors, input_info.first, input_info.second, tensor, true /*fill_one=true*/); } - // Prepare queue + // Prepare queue, potential startup_nodes queue.push(grad_node); + potential_startup_nodes.emplace(grad_node); } VLOG(6) << "Update In degree Map for backward"; @@ -155,25 +399,74 @@ void RunBackward(const std::vector& tensors, std::unordered_map node_in_degree_map = getInDegreeMap(queue); + // Get input's GradNodes and InputMeta Info + std::unordered_map + input_target_nodes_inputmeta_map; + GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map); + + // Purify potential_startup_ops, remove those nodes that are the same as + // input_target_nodes + PurifyPotentialStartUpNodes(&potential_startup_nodes, + &input_target_nodes_inputmeta_map); + + // Get Graph Info Betweent input target gradnode and outputs + // Record the depending_nodes and potential_stop_nodes + std::unordered_map /* father node */> + depending_nodes; + std::unordered_set potential_stop_nodes; + // std::unordered_set startup_ops; + + GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map, + &depending_nodes, &potential_stop_nodes, + &potential_startup_nodes); + + // ready_queue store all startup nodes + std::queue ready_queue; + // startup op's indegree should be 0 + for (auto node : potential_startup_nodes) { + if (node_in_degree_map[node] == 0) { + ready_queue.emplace(node); + } + } + + VLOG(1) << " startup_ops' size is :" << ready_queue.size(); + + std::unordered_map results_map; + + // read_queue is empty only when 1.input equals to output. 2.input can not + // reach to output. + if (ready_queue.size() == 0) { + for (auto input_target_node : input_target_nodes_inputmeta_map) { + // out rank_info of forward op + auto rank_info = input_target_node.second->OutRankInfo(); + if (node_input_buffers_dict[input_target_node.first]) { + auto& target_result = + node_input_buffers_dict[input_target_node.first] + ->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[input_target_node.first] = target_result; + } + } + } + /* --- Topological Visit --- */ // 1. Pop queue // 2. Run node + // |- Check and capture target result // |- node(grads) // |- Prepare for next node // 3. 
Update queue VLOG(6) << "Run Backward"; - while (!queue.empty()) { - GradNodeBase* node = queue.front(); + while (!ready_queue.empty()) { + GradNodeBase* node = ready_queue.front(); + VLOG(6) << "Running GradNode:" << node->name(); + ready_queue.pop(); paddle::platform::RecordEvent node_record_event( std::string(typeid(*node).name()) + " grad_node", paddle::platform::TracerEventType::Operator, 1); - if (queue.size() > 1 && node_in_degree_map[node] != 0) { - queue.pop(); - continue; - } - queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), @@ -184,10 +477,45 @@ void RunBackward(const std::vector& tensors, std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); + // get target grad_var from node_input_buffer by inputmeta + if (input_target_nodes_inputmeta_map.find(node) != + input_target_nodes_inputmeta_map.end()) { + VLOG(6) << "Get target result by by inputmeta"; + // out rank_info of forward op + auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo(); + // rank_info is a pair, first means slot_id, second means rank. + auto& target_result = + node_input_buffer->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[node] = target_result; + } + + // no_grad_vars + if (no_grad_var_nodes_inputmeta_map.find(node) != + no_grad_var_nodes_inputmeta_map.end()) { + VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; + auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo(); + node_input_buffer->SetBufferSlotRankZeros(rank_info.first, + rank_info.second); + } + + VLOG(6) << "Running GradNode:" << node->name(); + + // check input + EnforceGradNodeHasInput(node); + VLOG(6) << "Run Backward Kernel with GradTensorHolder"; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers()); + (*node)(node_input_buffer->Buffers(), create_graph); + + // retain_grad or not + if (!retain_graph) { + VLOG(6) + << "retain_graph is false, need to clear the TensorWrapper of nodes."; + node->ClearTensorWrappers(); + } + // TODO(jiabin): Should we erase it or find a more efficient way. node_input_buffers_dict.erase(node); @@ -252,18 +580,44 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; + PADDLE_ENFORCE( node_in_degree_map[next_node] >= 0, paddle::platform::errors::Fatal( "Detected in-degree value smaller than zero. 
For Node: %s" "Node's in-degree cannot be negative", next_node->name())); - if (node_in_degree_map[next_node] == 0) { - queue.emplace(std::move(next_node)); + + bool is_potential_stop_node = potential_stop_nodes.count(next_node); + + if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { + ready_queue.emplace(std::move(next_node)); } } } } + + return GetResults(inputs, &results_map, allow_unused, create_graph); } +void Backward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph) { + VLOG(6) << "Run in Backward"; + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + RunBackward(tensors, grad_tensors, retain_graph); +} + +std::vector Grad( + const std::vector& tensors, // output + const std::vector& inputs, + const std::vector& grad_tensors, + bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, + const std::vector& no_grad_vars) { + VLOG(6) << "Run in Grad"; + return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, + allow_unused, no_grad_vars); +} } // namespace egr diff --git a/paddle/fluid/eager/backward.h b/paddle/fluid/eager/backward.h index 2856d9fb87f34b1066bb59eb38bcaee786d2a260..bebe664838e6c1f98219ceee6e6733b49c319b3c 100644 --- a/paddle/fluid/eager/backward.h +++ b/paddle/fluid/eager/backward.h @@ -19,12 +19,20 @@ namespace egr { -// run_backward(): +// Backward(): // tensors corresponds to those lived in the backward graph // each grad_tensors[i] keeps the value for its corresponding tensors[i] -void RunBackward(const std::vector &tensors, - const std::vector &grad_tensors, - bool retain_graph = false); +void Backward(const std::vector& tensors, + const std::vector& grad_tensors, + bool retain_graph = false); + +std::vector Grad( + const std::vector& tensors, + const std::vector& inputs, + const std::vector& grad_tensors = {}, + bool retain_graph = false, bool create_graph = false, + bool only_inputs = false, bool allow_unused = false, + const std::vector& no_grad_vars = {}); // Reserved for gradient() diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 48ac8c8358afd68cee9d22b8ea0a4e8fd7c3c92e..72af1cc4b068679e72ae6bdc5e09fab8f56bac04 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -20,8 +20,8 @@ namespace egr { std::vector> RunCustomOpNode:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { paddle::CustomOpKernelContext ctx; auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index e5ddef9c062149282d790a5fd6bf31b25a20cf5a..6ece2658575c795856438904c2716d61f0985879 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -37,8 +37,8 @@ class RunCustomOpNode : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph) override; std::string name() { return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); @@ -62,6 +62,12 @@ class RunCustomOpNode : public 
GradNodeBase { return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + void SetAttrs(const std::vector& attr) { attrs_ = attr; } public: diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 16513f05e0777a8e57f54c925d68867dda656612..168e1bcca77ca85eb6fa90a23350d1f62f63dc8e 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -95,8 +95,12 @@ class GradNodeBase { * is better choice to fit this format. * **/ virtual std::vector> operator()( - const std::vector>& grads) = 0; + const std::vector>& grads, + bool create_graph = false) = 0; + virtual void ClearTensorWrappers() = 0; + + virtual bool IsTensorWrappersCleared() = 0; /** * AddEdges is designed to set input tensors' backward Node as current * node's Edges. diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 69fc7df2f1420382735cf59fbe85f7e2207d0f77..163d25e85ce8c085087331c6e3273075aed5e5f4 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -21,6 +21,11 @@ namespace egr { +void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + buffer_[slot_id][rank] = + paddle::experimental::zeros_like(buffer_[slot_id][rank]); +} + void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool fill_one) { diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index d66a81fe8285980bad4159d5414985dc9c744549..9059b403607461cc980a58d345fe1542aa4b1903 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -56,6 +56,8 @@ class GradTensorHolder { return buffer_; } + void SetBufferSlotRankZeros(size_t slot_id, size_t rank); + private: std::vector> buffer_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 31aaa93c41643f565836c536d7001c01d2a0826d..0e11444b81526de1904b72fc983814314d834a45 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -98,6 +98,8 @@ class TensorWrapper { } } + void clear() { intermidiate_tensor_.reset(); } + private: bool full_reserved_ = false; std::pair out_rank_info_; diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 1683f4ed5fbe5e4b014e9b369e0231d149c187f1..c8b2d22dcf95139db47704be86a6f64554f7c0ba 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -17,6 +17,14 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT); +#endif namespace eager_test { using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; @@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) { CHECK_EQ(dt3_tmp_ptr[1], 10.0f); t4.reset(); CHECK(t4.defined() == false); + + VLOG(6) << "Check Tensor Copy_"; + std::vector rows = {1, 2}; + std::vector dims = {2}; + paddle::experimental::Tensor 
t7(std::make_shared(rows, 2)); + std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->Resize(phi::make_ddim(dims)); + auto* dt7_tmp_ptr = std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->mutable_data(paddle::platform::CPUPlace()); + dt7_tmp_ptr[0] = 6.0f; + dt7_tmp_ptr[1] = 11.0f; + + paddle::experimental::Tensor t8; + paddle::experimental::Tensor t5; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::experimental::Tensor t6; + paddle::experimental::Tensor t9; + VLOG(6) << "Check Tensor Copy_ Selected Rows"; + t8.copy_(t7, paddle::platform::CUDAPlace(0), true); + t9.copy_(t8, paddle::platform::CPUPlace(), true); + auto* dt9_tmp_ptr = std::dynamic_pointer_cast(t9.impl()) + ->value() + .data(); + CHECK_EQ(dt9_tmp_ptr[0], 6.0f); + CHECK_EQ(dt9_tmp_ptr[1], 11.0f); + CHECK_EQ(std::dynamic_pointer_cast(t9.impl())->height(), + 2); + + VLOG(6) << "Check Tensor Copy_ Dense Tensor"; + t5.copy_(t3, paddle::platform::CUDAPlace(0), true); + t6.copy_(t5, paddle::platform::CPUPlace(), true); + auto* dt6_tmp_ptr = + std::dynamic_pointer_cast(t6.impl())->data(); + CHECK_EQ(dt6_tmp_ptr[0], 5.0f); + CHECK_EQ(dt6_tmp_ptr[1], 10.0f); +#else + t5.copy_(t3, paddle::platform::CPUPlace(), true); + auto* dt5_tmp_ptr = + std::dynamic_pointer_cast(t5.impl())->data(); + CHECK_EQ(dt5_tmp_ptr[0], 5.0f); + CHECK_EQ(dt5_tmp_ptr[1], 10.0f); +#endif + VLOG(6) << "Finish"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 535c93ac53b1751d9634476e47f32dc0cbe22708..0b167203735d65683b0f978fa34fe7f457aae4f2 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } std::vector> operator()( - const std::vector>& grads) - override { + const std::vector>& grads, + bool create_graph = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase { std::vector> res = {{et1}}; return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } float val_; }; } // namespace eager_test diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 769bd7f687f4584d44bbfa30b73611a3128289bf..887ea3e3acfd50a15206f3e84ab45e16707f80af 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -58,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } std::vector target_tensors = {input_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) @@ -80,7 +80,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -106,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = 
{input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -137,7 +137,7 @@ void benchmark_eager_intermediate_mlp( reduce_sum_dygraph_function(input0, {{"reduce_all", true}}); std::vector target_tensors = {Out}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { std::unordered_map result = diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index c65ad4641cf2206cc0f97d91f1fb24e50b7b63cd..52dba6b9218c7be8a29ae1aff619facd25a6f3b6 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 0c894ed267fcdd08d44d4df08bfaf0554874aebf..87f8f6eca1f88fe9a54583ee19586dd75c7e231e 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -33,6 +33,7 @@ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); namespace egr { @@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) { } std::vector outs = {target_tensor}; // Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); @@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { } // Run Backward - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) { } // Use Empty Grad Tensor - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) { node2_ptr->AddEdges(&res2, 0); } - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 36594f1aac8cdb131bb77f1396dca19a0c2e8cc0..8b0759c17ed3712079e8954df60e35afaaf02a9e 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { std::vector res = {meta}; scale_node_ptr->AddEdges(&res, 0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); 
eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a..882695e98d109e09340223e21322a02d1b48c6ea 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); VLOG(7) << "Target Grad is: " << std::static_pointer_cast( @@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) { std::vector outs = {out1}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 10.0); @@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); @@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad // leaf grad @@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 60.0); @@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 2.0); @@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { // TODO(jiabin): fix this with add functor // 4. 
Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 2a5ad53204a6201149bec0b3dac0fa3baf441f2e..68820443a2d5a68c73c0d5ebb855519fddbbf3d2 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -57,7 +57,7 @@ TEST(Generated, Sigmoid) { std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue(tensor, 0.25); @@ -89,7 +89,7 @@ TEST(Generated, Matmul_v2) { eager_test::CompareTensorWithValue(output_tensor, 96); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue(Y, 3.0 * 4); @@ -120,7 +120,7 @@ TEST(Generated, ElementwiseAdd) { eager_test::CompareTensorWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue(Y, 1.0); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b03799c48659c579938df6efc0f7cf57bbc0bec --- /dev/null +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +namespace egr { + +TEST(Grad, SingleNodeEmptyGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor (output) + paddle::experimental::Tensor output_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + + // Create input tensor + const paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Output_tensor set GradNode、OutRank、StopGradient propertis + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + // Get autograd_meta from input tensor + AutogradMeta* auto_grad_meta1 = + EagerUtils::unsafe_autograd_meta(leaf_tensor); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + // input tensor set GradNode、OutRank、StopGradient propertis + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // grad_node Add Edges + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + std::vector outs = {output_tensor}; + + // Run Grad + auto result = Grad(outs, {leaf_tensor}, {}); + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 5.0); +} + +TEST(Grad, SingleNodeCustomGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + std::vector grad_tensors; + // Create Grad Tensor + paddle::experimental::Tensor grad_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor)); + + paddle::experimental::Tensor 
leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Connect Tensor and Node via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* +Node1 + | +Node0 + | + { } // empty grad tensor +*/ +TEST(Grad, LinearNodes) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Target Tensor + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + + // Set grad in/out meta for node1 + node1_ptr->SetDefaultGradInOutMeta(); + + // Connect Input Tensor and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + // Connect Node0 -> Node1 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node1_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; + node1_ptr->AddEdges(&res1, 0); + } + + // Use Empty Grad Tensor + auto result = Grad(target_tensors, {leaf_tensor}, {}); + + // Check Output Value + 
eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* + Node2 + | | +Node0 Node1 + | | + in0 in1 +*/ +TEST(Grad, WithAccumulation) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + std::vector target_tensors; + paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor0)); + target_tensors.emplace_back(std::move(tensor1)); + + // Create Grad Tensor + std::vector grad_tensors; + paddle::experimental::Tensor grad_tensor0 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor grad_tensor1 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor0)); + grad_tensors.emplace_back(std::move(grad_tensor1)); + + paddle::experimental::Tensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + node1_ptr->SetDefaultGradInOutMeta(); + // Create Node2 + auto node2_ptr = std::make_shared(1, 1); + node2_ptr->SetAttributes_scale(20.0 /*scale*/); + node2_ptr->SetDefaultGradInOutMeta(); + // Connect Inp0 and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta0 = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta0->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta0->SetStopGradient(false); + // Connect Inp1 and Node1 via AutoGradMeta + AutogradMeta* auto_grad_meta1 = + EagerUtils::autograd_meta(&(target_tensors[1])); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(node1_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Connect Node0 -> Node2 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node2_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + // Connect Node1 -> Node2 via Edge + auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(node2_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); + + auto_grad_meta2->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; + node2_ptr->AddEdges(&res2, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + 
eager_test::CompareTensorWithValue(result[0], 2500.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index d546df4ed087a99a28096a5336fab3826991534a..2c53fc89f650e36f1435c7e1e805453fe7822cf2 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 4.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); @@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { leaf_tensor, std::make_shared(hook_function)); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 56813c498d2410caa452da7a334c393b230c65bf..0ee171c73c6600b95b9b093ef7e818855f53002d 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { } VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue( @@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue( @@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue( diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index d99624e49324853d513a20a725c1a3d12b6aaab5..4eaa64d3ac659ca0ec76083b70855d8b6b241556 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations virtual std::vector> operator()( - const std::vector> &grads) - override { + const std::vector> &grads, + bool create_graph) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; PADDLE_ENFORCE_EQ( grads.size(), 1, @@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase { // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + // SetAttrMap void SetAttrMap(const paddle::framework::AttributeMap &attrs) { attrs_ = attrs; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 623c8a048c2417ab51772c55b681031d9bcfd925..7aaaef712a6e9186058b579d1c69b0cfb201d899 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ 
b/paddle/fluid/framework/ir/CMakeLists.txt @@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) +pass_library(mixed_precision_configure_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..4aa59d9196b1b4d73fffa8f1b2a9bba08d6091be --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void MixedPrecisionConfigurePass::InsertCastOps( + Graph* graph, const StringSet& blacklist) const { + VLOG(3) << "Insert the cast op before and after the kernel that does not " + "supports fp16 precision"; + + auto update_cast_desc = [&]( + framework::OpDesc& desc, const std::string& x_name, + const std::string& out_name, const int in_dtype, const int out_dtype) { + desc.SetType("cast"); + desc.SetInput("X", {x_name}); + desc.SetOutput("Out", {out_name}); + desc.SetAttr("in_dtype", in_dtype); + desc.SetAttr("out_dtype", out_dtype); + desc.SetAttr("use_mkldnn", false); + desc.SetAttr("with_quant_attr", false); + desc.Flush(); + }; + + auto cast_input = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto inlinks = op_node->inputs; + for (auto* pre_node : inlinks) { + if (pre_node->IsVar()) { + const auto is_persistable = pre_node->Var()->Persistable(); + const auto is_float = + pre_node->Var()->GetDataType() == proto::VarType::FP16 || + pre_node->Var()->GetDataType() == proto::VarType::FP32 || + pre_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* pre_node_input : pre_node->inputs) { + if (!pre_node_input->IsOp()) continue; + const auto& type = pre_node_input->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = pre_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 4, 5); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + op_node->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(pre_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + 
IR_NODE_LINK_TO(node_var, op_node); + } + } + } + } + } + }; + + auto cast_output = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto outlinks = op_node->outputs; + for (auto* next_node : outlinks) { + if (next_node->IsVar()) { + const auto is_persistable = next_node->Var()->Persistable(); + const auto is_float = + next_node->Var()->GetDataType() == proto::VarType::FP16 || + next_node->Var()->GetDataType() == proto::VarType::FP32 || + next_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* next_node_output : next_node->outputs) { + if (!next_node_output->IsOp()) continue; + + const auto& type = next_node_output->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = next_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 5, 4); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + next_node_output->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(next_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, next_node_output); + } + } + } + } + } + }; + + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + const auto& type = op_node->Op()->Type(); + if (blacklist.count(type)) { + cast_input(graph, op_node, blacklist); + cast_output(graph, op_node, blacklist); + } + } +} + +void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const { + const auto blacklist = + Get>("gpu_fp16_disabled_op_types"); + InsertCastOps(graph, blacklist); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mixed_precision_configure_pass, + paddle::framework::ir::MixedPrecisionConfigurePass); diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fc5a612ecb833d2a5117a2dab58747d21226df8d --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
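+
+// MixedPrecisionConfigurePass keeps the op types listed in
+// "gpu_fp16_disabled_op_types" running in FP32 inside an otherwise FP16 GPU
+// graph: for each blacklisted op it inserts cast ops (FP16 -> FP32) before its
+// non-persistable floating-point inputs and cast ops (FP32 -> FP16) after its
+// outputs, so surrounding FP16 kernels keep receiving FP16 tensors.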
+ +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +using StringSet = std::unordered_set; + +class MixedPrecisionConfigurePass : public FusePassBase { + public: + MixedPrecisionConfigurePass() = default; + virtual ~MixedPrecisionConfigurePass() {} + + protected: + void ApplyImpl(Graph* graph) const override; + + private: + void InsertCastOps(Graph* graph, const StringSet& blacklist) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a5c32164bf1a28687ea6f8cc53427db67560c307..74e8ca3f229c6b7093e29cb53c0ce15e0b15d6a9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -188,6 +188,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); + DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool); + DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes, + std::unordered_set); // Usually use for trt dynamic shape. // TRT will select the best kernel according to opt shape diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 796c86a3ad1efe45dd8a00139b92c2642676a811..287c896e49bf254d70a5c79c818a39f913472f2f 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->dlnne_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + } else if (pass_name == "mixed_precision_configure_pass") { + pass->Set("gpu_fp16_disabled_op_types", + new std::unordered_set( + argument->gpu_fp16_disabled_op_types())); } if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index daa18d8c78bf875ebcc6571bf955a7f634948e4f..614eea24a0e2ee9d4fabd68a9374fa7c44b63ad7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" @@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { #else +void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap( + const framework::ir::Graph &graph, + std::unordered_map *var_name_op_type_map) { + std::vector node_list = + framework::ir::TopologyVarientSort( + graph, static_cast(0)); + for (auto *op_node : node_list) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + for (auto *pre_node : op_node->inputs) { + if (pre_node->IsVar() && pre_node->Var()->Persistable()) { + var_name_op_type_map->insert(std::pair( + pre_node->Var()->Name(), op_node->Op()->Type())); + } + } + } +} + void 
IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; @@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { if (with_dynamic_shape) { reserve_cpu_weights = true; } + + bool mixed_precision_mode = + argument->Has("use_gpu_fp16") && argument->use_gpu_fp16(); + std::unordered_map var_name_op_type_map{}; + std::unordered_set blacklist{}; + if (mixed_precision_mode) { + GetVarNameToOpTypeMap(graph, &var_name_op_type_map); + blacklist = argument->gpu_fp16_disabled_op_types(); + } + for (auto &var_name : all_vars) { if (std::count(repetitive_params.begin(), repetitive_params.end(), var_name)) { @@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { var->IsType()) { auto *t = var->GetMutable(); - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(t->dims()); - temp_tensor.mutable_data(cpu_place); - - // Copy the parameter data to a tmp tensor. - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - // Reallocation the space on GPU - t->clear(); - - // Copy parameter data to newly allocated GPU space. - paddle::framework::TensorCopySync(temp_tensor, place, t); + bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 || + t->dtype() == paddle::experimental::DataType::FLOAT64; + if (mixed_precision_mode && + !blacklist.count(var_name_op_type_map[var_name]) && is_float) { + framework::Tensor half_tensor; + half_tensor.set_type(paddle::experimental::DataType::FLOAT16); + half_tensor.Resize(t->dims()); + auto *half_data = + half_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < t->numel(); i++) { + auto *data = t->mutable_data(platform::CPUPlace()); + half_data[i] = static_cast(data[i]); + } + t->clear(); + paddle::framework::TensorCopySync(half_tensor, place, t); + } else { + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); + } } } } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index d5e98ec886e65f829a1496b1431f23aad6c4bc4c..f8209f051d53444435ed8c65b400f08bf8627553 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { #ifdef PADDLE_WITH_ASCEND_CL void CopyParamsToNpu(Argument *argument); #else - void CopyParamsToGpu(Argument *argument); + + void GetVarNameToOpTypeMap( + const framework::ir::Graph& graph, + std::unordered_map* var_name_op_type_map); + + void CopyParamsToGpu(Argument* argument); #endif }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 41c01d3b7e261314d8dc6b852f5b2a597421fe48..d08d28a3f623389790e63d45e13584a8d0db6adc 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, Update(); } + void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { #if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) @@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } + void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } +void AnalysisConfig::Exp_EnableUseGpuFp16( + std::unordered_set op_list) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + use_gpu_fp16_ = true; + gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end()); +#else + LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()"; + use_gpu_fp16_ = false; +#endif + + Update(); +} + void AnalysisConfig::DisableFCPadding() { use_fc_padding_ = false; @@ -213,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + CP_MEMBER(use_gpu_fp16_); + CP_MEMBER(gpu_fp16_disabled_op_types_); CP_MEMBER(enable_memory_optim_); // TensorRT related. @@ -573,6 +590,20 @@ void AnalysisConfig::Update() { #endif } + if (use_gpu_fp16_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!enable_ir_optim_) { + LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is " + "enabled."; + } else if (!use_gpu()) { + LOG(ERROR) + << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled."; + } else { + pass_builder()->Exp_EnableUseGpuFp16(); + } +#endif + } + if (use_mkldnn_) { #ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { @@ -669,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; + ss << use_gpu_fp16_; + for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; ss << gpu_device_id_; ss << xpu_device_id_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 871ed596a3ee9d6362b03e99ca10313765826a51..6f765ef415e9f2e309672c3f128723e3280e8a54 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -872,6 +872,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } + if (config_.gpu_fp16_enabled()) { + argument_.SetUseGPUFp16(true); + argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d..ecb5eaf982548c44eb97fde7e2b7365c9b0e9fc2 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -375,6 +375,19 @@ TEST(AnalysisPredictor, enable_onnxruntime) { ASSERT_TRUE(!config.use_onnxruntime()); } +TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) { + AnalysisConfig config; + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); + ASSERT_TRUE(config.gpu_fp16_enabled()); +#else + config.DisableGpu(); +#endif + LOG(INFO) << config.Summary(); +} + } // namespace paddle namespace paddle_infer { @@ -434,6 +447,19 @@ TEST(Predictor, EnableONNXRuntime) { auto predictor = CreatePredictor(config); } +TEST(Predictor, Exp_EnableUseGpuFp16) { + Config config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + 
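+  // Exp_EnableUseGpuFp16() only takes effect together with EnableUseGpu() and
+  // SwitchIrOptim(); the optional op-type argument is merged into
+  // gpu_fp16_disabled_op_types_ so those ops are kept in FP32.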
config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); +#else + config.DisableGpu(); +#endif + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 7b765e3fa8a24ef1b81b68da8ba12dd8e5577572..bdfe0e46e9ca4519c294a181cda6b8c4b87a6b9b 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig { /// /// void DisableGpu(); + /// + /// \brief Enable GPU fp16 precision computation, in experimental state. + /// + /// \param op_list The operator type list. + /// + void Exp_EnableUseGpuFp16(std::unordered_set op_list = {}); + /// + /// \brief A boolean state telling whether the GPU fp16 precision is turned + /// on. + /// + /// \return bool Whether the GPU fp16 precision is turned on. + /// + bool gpu_fp16_enabled() const { return use_gpu_fp16_; } /// /// \brief Turn on XPU. @@ -859,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig { int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool thread_local_stream_{false}; + bool use_gpu_fp16_{false}; + std::unordered_set gpu_fp16_disabled_op_types_{ + "conv2d_fusion", "conv2d", "roll", "strided_slice"}; bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 22d9dedb32ebfcc229e0034cc5cf6092907dc8df..95975d8f2a892e709e5591135f96fbff07eb62e3 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() { use_cudnn_ = true; } +void GpuPassStrategy::Exp_EnableUseGpuFp16() { + passes_.assign({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + // "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 +// cudnn8.0 has memory leak problem in conv + eltwise + act, so we +// disable the pass. +#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100) + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // +#endif + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", // + "mixed_precision_configure_pass", // + "runtime_context_cache_pass" // + }); + + use_gpu_fp16_ = true; +} + void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 351cf71e5ca7493928dfd81d776d847463f3b7bf..02290ed33ff1cd4f72d707d6f9d23f16e05c321b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable the use of cuDNN kernel. 
virtual void EnableCUDNN() {} + /// \brief Enable use gpu fp16 kernel. + virtual void Exp_EnableUseGpuFp16() {} + /// \brief Enable the use of MKLDNN. /// The MKLDNN control exists in both CPU and GPU mode, because there can /// still be some CPU kernels running in GPU mode. @@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } + /// \brief Check if we are using gpu fp16 kernel. + /// \return A bool variable implying whether we are in gpu fp16 mode. + bool use_gpu_fp16() const { return use_gpu_fp16_; } + /// \brief Check if we are using xpu. /// \return A bool variable implying whether we are in xpu mode. bool use_xpu() const { return use_xpu_; } @@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_npu_{false}; bool use_ipu_{false}; bool use_mkldnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; @@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Enable the use of cuDNN kernel. void EnableCUDNN() override; + /// \brief Enable the use of gpu fp16 kernel. + void Exp_EnableUseGpuFp16() override; + /// \brief Not supported in GPU mode yet. void EnableMKLDNN() override; @@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { protected: /// \cond Protected bool use_cudnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 9f7e4fb8d5749cf6bd54ed3e3bf9699199c0d3e6..70597be393c35e6939b83d86ce2f9be8f2c36805 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { extern __shared__ char* shared_max_data_tmp[]; auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T tmp = abs(in[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; } else { if (bid < n) { shared_max_data[tid] = abs(in[bid]); @@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, int channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; extern __shared__ T shared_max_data[]; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { T tmp = fabs(in_c[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { @@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, int tid = threadIdx.x; int bid = blockIdx.x; const T* in_current = in + tid * cout_wh_size + bid * wh_size; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { T tmp = fabs(in_current[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); int len = blockDim.x; @@ -404,6 +407,19 @@ 
struct FindRangeAbsMaxFunctor { } }; +template +__global__ void FindMovingAverageAbsMaxKernel(const T* in_state, + const T* in_accum, + const T* cur_scale, const T rate, + T* out_state, T* out_accum, + T* out_scale) { + T state = rate * (*in_state) + T(1.0f); + T accum = rate * (*in_accum) + (*cur_scale); + *out_state = state; + *out_accum = accum; + *out_scale = accum / state; +} + template struct FindRangeAbsMaxFunctor; template @@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor { framework::Tensor* out_accum, framework::Tensor* out_scale) { const auto gpu_place = ctx.GetPlace(); - T accum; - T state; - T scale; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - ctx.stream()); - ctx.Wait(); - T rate_t = static_cast(rate); - state = rate_t * state + static_cast(1.0); - accum = rate_t * accum + scale; - scale = accum / state; - - memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_state->mutable_data(gpu_place), - platform::CPUPlace(), &state, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), - platform::CPUPlace(), &scale, sizeof(T), ctx.stream()); - ctx.Wait(); + T* out_state_data = out_state->mutable_data(gpu_place); + T* out_accum_data = out_accum->mutable_data(gpu_place); + T* out_scale_data = out_scale->mutable_data(gpu_place); + + FindMovingAverageAbsMaxKernel<<<1, 1, 0, ctx.stream()>>>( + in_state.data(), in_accum.data(), cur_scale, rate_t, + out_state_data, out_accum_data, out_scale_data); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 6ee9582dacde372886075bb7c5619c6bc1b99c98..f6d3fd898469113dcffce76a84e4c292603707c6 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -15,9 +15,13 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -27,43 +31,6 @@ using Tensor = framework::Tensor; class GridSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler"); - OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler"); - - auto x_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(X) of GridSampleOp should be 4-D Tensor, but " - "received X dimension size(%d)", - x_dims.size())); - PADDLE_ENFORCE_EQ(grid_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(Grid) of GridSampleOp should be 4-D Tensor, " - "but received X dimension size(%d)", - grid_dims.size())); - if (ctx->IsRuntime() || grid_dims[3] > 0) { - PADDLE_ENFORCE_EQ( - grid_dims[3], 2, - platform::errors::InvalidArgument( - "Input(Grid) dimension[3] should be 2, but received %d", - grid_dims[3])); - } - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - grid_dims[0], x_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Grid) dimension[0] should be equal, but " - "received X dimension[0](%d) != Grid dimension[0](%d)", - x_dims[0], grid_dims[0])); - } - - ctx->SetOutputDim("Output", - {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); - ctx->ShareLoD("X", "Output"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { class GridSampleOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - framework::GradVarName("X"), "grid_sampler"); - auto input_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), input_dims); - } - if (ctx->HasOutput(framework::GradVarName("Grid"))) { - ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -224,10 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor, + PD_INFER_META(phi::GridSampleBaseInferMeta)); REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker, ops::GridSampleGradMaker, - ops::GridSampleGradMaker); -REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad); + ops::GridSampleGradMaker, + GridSamplerInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); +REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad, + GridSamplerGradInferShapeFunctor); 
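+// Shape inference for grid_sampler is now provided by phi: the removed
+// InferShape() overrides are replaced by InferShapeFunctors that wrap phi
+// InferMeta functions and are registered together with the operator. A minimal
+// sketch of the pattern (op/meta names below are placeholders, not taken from
+// this patch):
+//   DECLARE_INFER_SHAPE_FUNCTOR(some_op, SomeOpInferShapeFunctor,
+//                               PD_INFER_META(phi::SomeInferMeta));
+//   REGISTER_OPERATOR(some_op, ops::SomeOp, SomeOpInferShapeFunctor);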
REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index e110432c67d395c865d934a47eaa4a803053db8b..c9e80c7b4b407456fc962f508ae441a9c07914b2 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, EAGER_TRY auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); - egr::RunBackward(tensors, grad_tensors, - CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + egr::Backward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); + auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); + auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2); + auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); + auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5); + auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6); + auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7); + + std::vector result = + egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars); + VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad"; + return ToPyObject(result, true /* return_py_none_if_not_initialize */); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -452,6 +472,9 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_partial_grad", + (PyCFunction)(void (*)(void))eager_api_run_partial_grad, + METH_VARARGS | METH_KEYWORDS, NULL}, {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op, METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 082ec382c79cd9c98ac75db14bc552883088b885..7f8fcd351fe2a0a9712560f913a83f2cc3580395 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -226,6 +226,19 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto cp_tensor = + self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true); + egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); + egr::EagerUtils::autograd_meta(&cp_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + return ToPyObject(cp_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_reconstruct_from_(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -264,7 +277,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, 
egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); } - self->tensor.copy_(src_tensor, blocking); + self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking); VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " << self->tensor.name(); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 2572866b8f5198b2414163d4198e06b54d11fedc..ff8980d727e70a41223878f22f019353f8b71972 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -96,7 +96,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, "Detected NULL grad" "Please check if you have manually cleared" "the grad inside autograd_meta")); - grad->copy_(src, true); + grad->copy_(src, self->tensor.inner_place(), true); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 217edad0c0a105cc649c6c8c4433b0c8eab0119b..97bb32630d71368de2dee205fbef186a8551d9c7 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -492,20 +492,26 @@ PyObject* ToPyObject(const std::vector& value) { return result; } -PyObject* ToPyObject(const std::vector& value) { +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize) { PyObject* result = PyList_New((Py_ssize_t)value.size()); for (size_t i = 0; i < value.size(); i++) { - PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); - if (obj) { - auto v = reinterpret_cast(obj); - new (&(v->tensor)) paddle::experimental::Tensor(); - v->tensor = value[i]; + if (!value[i].initialized() && return_py_none_if_not_initialize) { + Py_INCREF(Py_None); + PyList_SET_ITEM(result, static_cast(i), Py_None); } else { - PADDLE_THROW(platform::errors::Fatal( - "tp_alloc return null, can not new a PyObject.")); + PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->tensor)) paddle::experimental::Tensor(); + v->tensor = value[i]; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + PyList_SET_ITEM(result, static_cast(i), obj); } - PyList_SET_ITEM(result, static_cast(i), obj); } return result; diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 2187555e1c3c7f64bd864e4212bfc6ebe1fb1684..1c4e2ab69a5ecba1209a11651c3c11972dff565c 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -68,7 +68,8 @@ PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); -PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize = false); PyObject* ToPyObject(const platform::Place& value); PyObject* ToPyObject(const framework::LoDTensor* value); PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b008308e27d9afaa9d8c47290489d50a762f2a41..c8f0acd0b8a853f541a6fb8cbafe73f27688c71a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -551,6 +551,9 @@ void BindAnalysisConfig(py::module *m) { .def("params_file", &AnalysisConfig::params_file) .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, 
py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) + .def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16, + py::arg("gpu_fp16_disabled_op_types") = + std::unordered_set({})) .def("enable_xpu", &AnalysisConfig::EnableXpu, py::arg("l3_workspace_size") = 16 * 1024 * 1024, py::arg("locked") = false, py::arg("autotune") = true, diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 4e273f6d551edd74ec979e6ec34aedabdb58bd10..e777a8e3ab4e6a59662ce7b4eb9a31a7409d6f56 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -3,12 +3,22 @@ if (NOT WITH_INFRT) endif() option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) +option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF) +option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) if (INFRT_WITH_PHI) - add_definitions("-DINFRT_WITH_PHI") + add_definitions("-DINFRT_WITH_PHI") + + # TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later. + if (INFRT_WITH_GPU) + add_definitions("-DINFRT_WITH_GPU") + if (INFRT_WITH_TRT) + add_definitions("-DINFRT_WITH_TRT") + endif() + endif() endif() # compile flags @@ -92,7 +102,6 @@ set(infrt_mlir_incs test_kernels_inc tensor_shape_inc dense_tensor_inc - pd_ops_inc pd_extra_ops_inc trt_ops_inc ) @@ -106,6 +115,9 @@ if (INFRT_WITH_PHI) endif() cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +if (INFRT_WITH_TRT) + target_link_libraries(infrt infrt_trt) +endif() cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) diff --git a/paddle/infrt/backends/host/phi_allocator.h b/paddle/infrt/backends/host/phi_allocator.h index c8f97e04a1b8376efbac749fffa70d77c7b95e72..6e3bef9299162d493825f49e3962c75f2845e2d0 100644 --- a/paddle/infrt/backends/host/phi_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -13,6 +13,10 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" +#ifdef INFRT_WITH_GPU +#include +#endif + namespace infrt { namespace backends { @@ -29,5 +33,22 @@ class CpuPhiAllocator : public phi::Allocator { } }; +#ifdef INFRT_WITH_GPU +// TODO(wilber): Just for demo test. we need a more efficient gpu allocator. +class GpuPhiAllocator : public phi::Allocator { + public: + static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); } + + AllocationPtr Allocate(size_t bytes_size) { + void* ptr; + cudaMalloc(&ptr, bytes_size); + return AllocationPtr( + new phi::Allocation( + ptr, bytes_size, phi::Place(phi::AllocationType::GPU)), + deleter); + } +}; +#endif + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h index 5713fdbbaf82b2ea2190d2ee1b1dc5d944f2c262..bcd63dbb39fe8c52499138423bc9b86fa5de9d57 100644 --- a/paddle/infrt/backends/host/phi_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace infrt { namespace backends { @@ -31,5 +32,16 @@ class CpuPhiContext : public phi::CPUContext { std::unique_ptr alloc_{std::make_unique()}; }; +class GpuPhiContext : public phi::GPUContext { + public: + using Base = phi::GPUContext; + using phi::GPUContext::SetStream; + using phi::GPUContext::SetEigenDevice; + using phi::GPUContext::SetBlasHandle; + using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSparseHandle; +}; + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 12cf14060e27c1d58e3fd9b14cc12b3c1f7f8907..0ab64dd51c88758d043fb9105ffbf0d109e44cc0 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -37,9 +37,9 @@ namespace infrt { namespace backends { namespace tensorrt { -const char* model_input = "model_input"; -const char* model_output = "model_output1"; -const char* model_output2 = "model_output2"; +const char* model_input = "input_0"; +const char* model_output = "output_0"; +const char* model_output2 = "output_1"; TrtUniquePtr ConstructNetwork( nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { @@ -122,27 +122,26 @@ TEST(trt, run_static) { std::unordered_map inputs; inputs.emplace(std::make_pair(model_input, &input)); - phi::DenseTensor output, output2; - std::unordered_map outputs; - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - static_trt_engine.SetUpInference(inference_options, inputs, &outputs); + static_trt_engine.PrepareOutputHandle("output_0"); + static_trt_engine.PrepareOutputHandle("output_1"); + static_trt_engine.SetUpInference(inference_options, inputs); static_trt_engine.GetEngineInfo(); static_trt_engine.Run(context); + phi::DenseTensor* output0 = static_trt_engine.GetOutput("output_0"); + phi::DenseTensor* output1 = static_trt_engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 28 * 28, 0); std::vector output_data2(inference_options.batch * 2 * 28 * 28, 0); paddle::memory::Copy(phi::CPUPlace(), output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); @@ -208,27 +207,27 @@ TEST(trt, run_dynamic) { context.stream()); std::unordered_map inputs; - std::unordered_map outputs; inputs.emplace(std::make_pair(model_input, &input)); - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - engine.SetUpInference(inference_options, inputs, &outputs); + engine.PrepareOutputHandle("output_0"); + engine.PrepareOutputHandle("output_1"); + engine.SetUpInference(inference_options, inputs); engine.GetEngineInfo(); engine.Run(context); + phi::DenseTensor* output0 = engine.GetOutput("output_0"); + phi::DenseTensor* output1 = engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 16 * 16, 0); std::vector output_data2(inference_options.batch * 2 * 16 * 16, 0); paddle::memory::Copy(phi::CPUPlace(), 
output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index 232653e8c41f71fd9bb32c9eac302b047d122b66..43d356b6d6983afdca220029d34d9d5cd27da009 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -21,6 +21,7 @@ #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace backends { @@ -235,10 +236,20 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, return true; } +void TrtEngine::PrepareOutputHandle(const std::string& out_name) { + phi::DenseTensor t; + outputs_.emplace(out_name, t); +} + +phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { + return &outputs_[name]; +} + +size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } + bool TrtEngine::SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs) { + const std::unordered_map& inputs) { // TODO(wilber): now only create one exec_context FreshDeviceId(); CHECK(engine_ != nullptr); @@ -252,10 +263,10 @@ bool TrtEngine::SetUpInference( bindings_.front()->AddBinding( bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT); } - for (auto& it : *outputs) { + for (auto& it : outputs_) { const int bind_index = engine_->getBindingIndex(it.first.c_str()); bindings_.front()->AddBinding( - bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT); + bind_index, it.first, false, &it.second, nvinfer1::DataType::kFLOAT); } return true; @@ -290,11 +301,13 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { const int bind_index = engine_->getBindingIndex(bind.name.c_str()); std::vector ddim; auto dims = engine_->getBindingDimensions(bind_index); + CHECK_NE(runtime_batch, -1) << "runtime_batch should not be -1."; ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; ++i) { ddim.push_back(dims.d[i]); } bind.buffer->Resize(phi::make_ddim(ddim)); + // TODO(wilber): now only support float output. ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); buffers[bind_index] = static_cast(bind.buffer->data()); } diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index 3c8243e3c3838e30eb70877f8c82d623c103eaff..a26474f8cbb357d42cd6d951829bbdc24a256640 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -81,11 +81,17 @@ class TrtEngine { // TODO(wilber): How to support multiple execution contexts? bool SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs); + const std::unordered_map& inputs); void GetEngineInfo(); + void PrepareOutputHandle(const std::string& out_name); + + // TODO(wilber): The output tensor names are: output_0, output_1, ... 
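+  // Intended flow (as exercised in test_trt_engine.cc): call
+  // PrepareOutputHandle(name) for each output before SetUpInference(), run the
+  // engine, then read results through GetOutput(name), which returns the
+  // DenseTensor owned by the engine in outputs_.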
+ phi::DenseTensor* GetOutput(const std::string&); + + size_t GetOutputNum() const; + private: void FreshDeviceId(); @@ -112,6 +118,7 @@ class TrtEngine { std::vector> bindings_; int device_id_{0}; bool is_dynamic_shape_{false}; + std::unordered_map outputs_; }; } // namespace tensorrt diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index a3f2d0afafc417cc7a4cbba8a3d6bfa92c9bef00..cf3906c32e559d9fa33d0583be9adb1b2591e78b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -7,16 +7,10 @@ gather_srcs(infrt_src SRCS dense_tensor.cc mlir_loader.cc diagnostic_utils.cc - pd_ops.cc ) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) -mlir_tablegen_on(pd_op_base DIALECT pd) -mlir_tablegen_on(pd_ops) -mlir_tablegen_on(pd_extra_ops) - -mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) @@ -24,10 +18,10 @@ target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) -add_dependencies(print-ir pd_ops_inc) cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) add_subdirectory(infrt) +add_subdirectory(pd) add_subdirectory(tensorrt) if (INFRT_WITH_PHI) diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 666c7b300af33db0c27e5b3ab8a74aa4b1591c9b..59df4e9697370e9d8db4bbc0a5d69e8ef03950a5 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -130,7 +130,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { } def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { - let summary = "ddt.tensor_map_get_size operation"; + let summary = "dt.tensor_map_get_size operation"; let description = [{ An operation that get the size of a TensorMap. @@ -141,6 +141,32 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)"; } +def Infrt_TensorListGetTensorOp : DT_Op<"tensor_list_get_tensor", [NoSideEffect]> { + let summary = "dt.tensor_list_get_tensor operation"; + + let description = [{ + An operation that can get a tensor from a TensorList. + }]; + + let arguments = (ins + DenseTensorList:$l, + I32Attr:$id + ); + let results = (outs DenseTensor:$output); + let verifier = ?; +} + +def TensorListGetSizeOp : DT_Op<"tensor_list_get_size", [NoSideEffect]> { + let summary = "dt.tensor_list_get_size operation"; + + let description = [{ + An operation that get the size of a TensorList. + }]; + + let arguments = (ins DenseTensorList:$map); + let results = (outs I32:$size); +} + def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { let summary = "dt.get_tensor_shape operation"; diff --git a/paddle/infrt/dialect/infrt/ir/infrt_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td index c5130e89bb13a58a0aa0cf3aeae1b00e269eb259..86cfc375330b19878528645a2e810efb797e153f 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_base.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td @@ -89,6 +89,13 @@ def DenseTensorMap : Infrt_Type<"DenseTensorMap"> { let parameters = (ins); } +// TODO(wilber): Add !infrt.vec type. 
+def DenseTensorList : Infrt_Type<"DenseTensorList"> { + let summary = "infrt dense tensor map"; + let description = [{dense_tensor map}]; + let parameters = (ins); +} + // Type Constrait for concrete DenseTensor type. class DenseTensor : Type, diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 3a1b45d3a20a1e3ff6698f37e412837fcb064f7c..8966ca13c2be08f1c744a73b4beaf20b0a3c015c 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -138,6 +138,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { parser.getContext(), *targetType, *precisionType, *layoutType); } + if (keyword == "tensor_list") { + return infrt::DenseTensorListType::get(parser.getContext()); + } + if (keyword == "dense_tensor_map") { return DenseTensorMapType::get(parser.getContext()); } @@ -175,6 +179,9 @@ void InfrtDialect::printType(::mlir::Type type, return; } + if (type.isa()) { + os << "tensor_list"; + } // print DenseTensorType, for example: !infrt.dense_tensor if (type.isa()) { os << "dense_tensor_map"; diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index 7ae0bbae6275fdac1ea9e98084c866aa438ecce4..3d825a9c762f4833e577125d20423a5f1d41737f 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" def FuseTensorCastPattern : Pat< (Infrt_TensorCastOp (Infrt_TensorCastOp $arg)), diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index 9d8ce5d8dfe399d06bfbe2f0c4b6457a8b3d61f1..eec0e0bc7c5ab624e9db7744c357b58ff5107eef 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -16,7 +16,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 0c5944ebf84750be8cf789552219157da3170c39..6183295cafb356e85c0fd8bf417c3fb18eb30787 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -20,12 +20,13 @@ #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" #include "paddle/infrt/dialect/tensor_shape.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT @@ -37,7 +38,8 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT phi::PHIDenseTensorDialect, phi::PHICPUKernelDialect, phi::PHIGPUKernelDialect, - phi::PHIDialect + phi::PHIDialect, + infrt::trt::TensorRTDialect #endif >(); } diff --git a/paddle/infrt/dialect/pd/CMakeLists.txt b/paddle/infrt/dialect/pd/CMakeLists.txt new file mode 
100644 index 0000000000000000000000000000000000000000..5f65336453fbdf82f30948aeea8dc52b0367159b --- /dev/null +++ b/paddle/infrt/dialect/pd/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(common) +add_subdirectory(ir) +add_subdirectory(pass) diff --git a/paddle/infrt/dialect/pd/common/CMakeLists.txt b/paddle/infrt/dialect/pd/common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee1b0d4c30deb2e7fbf19aa91ec3dd3bdcd449af --- /dev/null +++ b/paddle/infrt/dialect/pd/common/CMakeLists.txt @@ -0,0 +1,4 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + ) diff --git a/paddle/infrt/dialect/pd/ir/CMakeLists.txt b/paddle/infrt/dialect/pd/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aacfc97623c0dadc0ccb604440ce19427d860ba --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_ops.cc + ) +add_mlir_dialect(pd_ops pd) +mlir_tablegen_on(pd_extra_ops) diff --git a/paddle/infrt/dialect/pd_extra_ops.td b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td similarity index 90% rename from paddle/infrt/dialect/pd_extra_ops.td rename to paddle/infrt/dialect/pd/ir/pd_extra_ops.td index c6d3f530455f76d0352ef5ac42297c30ce521da2..cf17db211cbe98c586423c7db050dfdc12576cff 100644 --- a/paddle/infrt/dialect/pd_extra_ops.td +++ b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td @@ -4,7 +4,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/pd_op_base.td" +include "paddle/infrt/dialect/pd/ir/pd_op_base.td" def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> { let summary = "Computes the Fully Connected result of two tensors"; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd/ir/pd_op_base.td similarity index 96% rename from paddle/infrt/dialect/pd_op_base.td rename to paddle/infrt/dialect/pd/ir/pd_op_base.td index f6af4c83aed8bd0b7ce04c172169b036e674777b..7cab0eca45a1e7f74115f906db10a77f2eb1023b 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd/ir/pd_op_base.td @@ -8,7 +8,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -def PD_Dialect : Dialect { +def Paddle_Dialect : Dialect { let name = "pd"; let description = [{ @@ -16,12 +16,12 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. }]; - + let hasConstantMaterializer = 1; let cppNamespace = "mlir::pd"; } class PD_Op traits = []> : - Op; + Op; class PD_PaddleAttr : diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..d105aa07dd06a9a9c3aba870702b1e304a3a938a --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
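+
+// pd_ops.cc defines the Paddle (pd) dialect: the op classes come from the
+// TableGen-generated pd_ops.cpp.inc / pd_extra_ops.cpp.inc, and ConstantOp is
+// given a constant materializer plus a fold hook so pd.constant values can take
+// part in MLIR constant folding.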
+ +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + +#include +#include + +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT + +namespace mlir { +namespace pd { +void PaddleDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT + , +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT + >(); +} + +mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, + mlir::Attribute value, + mlir::Type type, + mlir::Location loc) { + return builder.create(loc, value); +} + +void ConstantOp::build(OpBuilder &builder, + OperationState &state, + Attribute value) { + if (auto elem_attr = value.dyn_cast()) { + return ConstantOp::build(builder, state, elem_attr); + } else if (value.isa()) { + ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", DenseElementsAttr::get(type, value)); + state.addTypes(type); + return; + } + llvm_unreachable("unsupported attribute type for building pd.constant"); +} + +LogicalResult ConstantOp::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(attributes.get("value").getType()); + return success(); +} +mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef operands) { + return value(); +} +} // namespace pd +} // namespace mlir diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.h b/paddle/infrt/dialect/pd/ir/pd_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8383ff6ed8201c4f8948ebaa4effaac3d783cc52 --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/pd_ops.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +//===----------------------------------------------------------------------===// +// Dialect +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_ops.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..827df597b76e2ec5b4cf639c984a425f9be8b6c9 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt @@ -0,0 +1,8 @@ + +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_op_fuse_pass.cc + ) + +mlir_add_rewriter(pd_op_fuse) diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td similarity index 97% rename from paddle/infrt/dialect/rewrite.td rename to paddle/infrt/dialect/pd/pass/pd_op_fuse.td index 62e7471a390dfeee1a9ddfc15033e85db0adca2e..f5a8ea78d7d9da5cc70b50d31836b4f4933d5853 100644 --- a/paddle/infrt/dialect/rewrite.td +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td @@ -3,8 +3,8 @@ include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/pd_ops.td" -include "paddle/infrt/dialect/pd_extra_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_extra_ops.td" //===----------------------------------------------------------------------===// // This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'. diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8bdf957db27d8c2b20025931a76826628feddbdd --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT + +#include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + +namespace { +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse.cpp.inc" // NOLINT + +/* + * PdOpFusePass. + */ +struct PdOpFusePass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "PdOpFusePass"; } + + llvm::StringRef getArgument() const override { return "pd-op-fuse"; } + + void runOnFunction() override; +}; + +// Implementation of the PdOpFusePass. 
+void PdOpFusePass::runOnFunction() { + ::mlir::RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); +} + +} // namespace + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..854545ab1a2638224e16a300bfccb1f953f81c77 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +/* + * PdOpFusePass. + */ +std::unique_ptr CreatePdOpFusePass(); + +} // namespace infrt diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc deleted file mode 100644 index 96e9e307f2fd3f33be3d2273a7aa66c363e4beb1..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_ops.cc +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/infrt/dialect/pd_ops.h" - -#include -#include - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT - -namespace mlir { -namespace pd { - -#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT - -PaddleDialect::PaddleDialect(MLIRContext *context) - : Dialect("pd", context, TypeID::get()) { - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT - , -#define GET_OP_LIST -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT - >(); -} - -mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, - mlir::Attribute value, - mlir::Type type, - mlir::Location loc) { - return builder.create(loc, value); -} - -void ConstantOp::build(OpBuilder &builder, - OperationState &state, - Attribute value) { - if (auto elem_attr = value.dyn_cast()) { - return ConstantOp::build(builder, state, elem_attr); - } else if (value.isa()) { - ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); - state.addAttribute("value", DenseElementsAttr::get(type, value)); - state.addTypes(type); - return; - } - llvm_unreachable("unsupported attribute type for building pd.constant"); -} - -LogicalResult ConstantOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(attributes.get("value").getType()); - return success(); -} -mlir::OpFoldResult ConstantOp::fold( - ::llvm::ArrayRef operands) { - return value(); -} -/* -LogicalResult ElementwiseAdd::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} -*/ - -void Elementwise_addOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -/* -mlir::OpFoldResult ElementwiseAdd::fold( - llvm::ArrayRef operands) { - if (getElementTypeOrSelf(getType()).isa()) { - if (!operands[0] || !operands[1]) return {}; - DenseElementsAttr lhs = operands[0].dyn_cast(); - DenseElementsAttr rhs = operands[1].dyn_cast(); - if (!lhs || !rhs) return {}; - ShapedType type = getType().template cast(); - if (!type.hasStaticShape()) return {}; - Type etype = type.getElementType(); - if (!etype.isa()) return {}; - SmallVector values; - values.reserve(lhs.getNumElements()); - for (const auto zip : - llvm::zip(lhs.getValues(), rhs.getValues())) { - values.push_back( - std::plus()(std::get<0>(zip), std::get<1>(zip))); - } - return DenseElementsAttr::get(type, values); - } - return {}; -} - -LogicalResult ElementwiseDiv::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseMul::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseSub::inferReturnTypes( - MLIRContext *context, - Optional 
location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult MulOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -void ReluOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void FusedRepeatedFCRelu::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void BatchNormOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -}*/ - -} // namespace pd -} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h deleted file mode 100644 index e6b0f30c059054189fe3a86bb112da923ad76423..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_ops.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" - -namespace mlir { -namespace pd { - -class PaddleDialect : public Dialect { - public: - explicit PaddleDialect(MLIRContext* context); - - static StringRef getDialectNamespace() { return "pd"; } - - /// A hook used to materialize constant values with the given type. - Operation* materializeConstant(OpBuilder& builder, - Attribute value, - Type type, - Location loc) override; - - Type parseType(DialectAsmParser& parser) const override { - return Dialect::parseType(parser); - } - void printType(Type type, DialectAsmPrinter& printer) const override { - Dialect::printType(type, printer); - } -}; - -} // namespace pd -} // namespace mlir - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 8c3a79498d74d3b80e1590bbc2c0530c7af6411e..1fda2d9d8886008c6415b5a1cf36d53c1500707a 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -21,8 +21,8 @@ def PHI_DenseTensorDialect : Dialect { class PDT_Op traits = []> : Op {} -class CreateDenseTensorOp - : PDT_Op<"create_dense_tensor", [NoSideEffect]> { +class CreateDenseTensorOp + : PDT_Op<"create_dense_tensor." 
# target, [NoSideEffect]> { let arguments = (ins Context:$context, I64ArrayAttr:$dims, LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision); let results = (outs DenseTensor:$output); @@ -51,9 +51,11 @@ class CreateContextOp let results = (outs Context:$output); } -def PDT_CreateDenseTensorOp : CreateDenseTensorOp; +def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">; +def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">; def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateCPUContextOp : CreateContextOp<"cpu">; +def PDT_CreateGPUContextOp : CreateContextOp<"gpu">; def PDT_PrintDenseTensor : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index f9e124aba6c28695f2c0fafa91404d2d10db8eea..13cba6eeabb669cf93deb9a37d87d2ddff66e5c0 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -32,6 +32,7 @@ #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/ops/compat/signatures.h" namespace { @@ -94,42 +95,49 @@ void PhiOpConvertPass::convertStage() { // Todo: print log continue; } - - ::phi::KernelSignature kernel_sign = - ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( - infrt::ProtoArgumentMappingContext(op)); - // resort input&output according to kernel_sign - ::llvm::SmallVector inputs, ori_output; - ::llvm::SmallVector output_types; - for (const std::string &str : std::get<0>(kernel_sign.args)) { - if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { - LOG(ERROR) << "No input info for Op " << op_name << " and argument " - << str; - return; + auto loc = getFunction().getLoc(); + builder.setInsertionPoint(op); + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) { + std::string kernel_name = phi::TransToPhiKernelName(op_name); + auto kernel_op = builder.create(loc, + op->getResultTypes(), + op->getOperands(), + kernel_name, + op->getAttrDictionary()); + op->replaceAllUsesWith(kernel_op.getResults()); + } else { + ::phi::KernelSignature kernel_sign = + ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + infrt::ProtoArgumentMappingContext(op)); + // resort input&output according to kernel_sign + ::llvm::SmallVector inputs, ori_output; + ::llvm::SmallVector output_types; + for (const std::string &str : std::get<0>(kernel_sign.args)) { + if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No input info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); + inputs.push_back(op->getOperands()[index]); } - uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); - inputs.push_back(op->getOperands()[index]); - } - for (const std::string &str : std::get<2>(kernel_sign.args)) { - if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { - LOG(ERROR) << "No output info for Op " << op_name << " and argument " - << str; - return; + for (const std::string &str : std::get<2>(kernel_sign.args)) { + if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No output info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = 
pd_dialect_outputs_info_map_.at(op_name).at(str); + output_types.push_back(op->getResultTypes()[index]); + ori_output.push_back(op->getResult(index)); + } + auto kernel_op = builder.create( + loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); + for (size_t index = 0; index < ori_output.size(); ++index) { + ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); } - uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); - output_types.push_back(op->getResultTypes()[index]); - ori_output.push_back(op->getResult(index)); - } - - auto loc = getFunction().getLoc(); - builder.setInsertionPoint(op); - auto kernel_op = builder.create( - loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); - for (size_t index = 0; index < ori_output.size(); ++index) { - ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); } - CHECK(op->use_empty()); op->erase(); } diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index e4e9b5c3ff8a15dbe00dc1bd57fdce1a087437d8..7d08c32161b751d8363be5a4e2024b2b691b1dd4 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/phi/core/compat/arg_map_context.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td index 46c250b05492cefe61d8e677a352a217718189b8..6467c1285f85e0c8bfca7b873ce64a09a52074ff 100644 --- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" include "paddle/infrt/dialect/tensorrt/trt_ops.td" def PD2TRT_Matmul_Lower : Pat< diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index ad6b136463a71dcc2fcd9ce2b4e2da6f68e88dd2..e22a2309cbe2d343fab4e6e918d3c5a3f98cbb4e 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -17,11 +17,12 @@ #include #include #include -#include #include #include #include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + namespace infrt { namespace trt { namespace { diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index e3a7b455024c65d40ccbafb28fba9e9b0ead0369..f81179e548fd5fb15850e9b8943bce440dc3091c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -15,7 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 83bebdb6bf19bdf8f75d11d693813b8169e297a0..1e6a3e1380555ea94b0d5de9d64cdc42a27e894e 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -14,7 +14,7 @@ 
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 9f348b4122fc74033703c92459e6cfa5b3a1f3a2..2c6f08277c803fe744bbfe559f21a6b8b085b816 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -17,7 +17,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index d5222976625a2adece9a87c8952dba10137ae9ba..415a78a6967ab6fd4e2a38380d09a5d5c64b1c2f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -21,6 +21,10 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 78d960b5120454bdd01b779abedbe2f7ec0d5853..76768037dbdb3072976d9f6cf0cdfb4f7956bdd4 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -30,7 +30,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 132a1d7805bdb85af8716e384ec29357a6ff68ad..31b28a38e7cfee4eb6da68302d482218d97f8350 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -7,6 +7,8 @@ include "mlir/Interfaces/CallInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/tensorrt/trt_op_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "trt CreateEngine Op"; @@ -14,8 +16,8 @@ def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator< Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); - let results = (outs TRT_EngineType:$output); + let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); + let results = (outs TRT_EngineType:$engine); } def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { @@ -23,8 +25,25 @@ def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { let description = [{ Describe a tensorrt runtime. 
}]; - let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); - let results = (outs Variadic:$output); + let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); + let results = (outs Variadic:$output); +} + +def TRT_EngineComputeOp : TRT_Op<"compute", [NoSideEffect]> { + let summary = "trt compute engine"; + let description = [{ + execute engine + }]; + let arguments = (ins TRT_EngineType:$engine, Context:$context); + let results = (outs DenseTensorList:$outputs); +} + +def TRT_InspectEngineOp : TRT_Op<"inspect_engine", [NoSideEffect]> { + let summary = "trt inspect engine"; + let description = [{ + Show engine + }]; + let arguments = (ins TRT_EngineType:$engine); } def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { @@ -34,11 +53,11 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { TensorRT IActivationLayer. }]; - let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type, + let arguments = (ins DenseTensor:$input, SI32Attr:$activation_type, DefaultValuedAttr:$alpha, DefaultValuedAttr:$beta); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { @@ -48,9 +67,9 @@ def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { TensorRT IElementWiseLayer. }]; - let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation); + let arguments = (ins DenseTensor:$input1, DenseTensor:$input2, SI32Attr:$elementwise_operation); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { @@ -60,10 +79,10 @@ def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { TensorRT IMatrixMultiplyLayer. 
}]; - let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1, - TRT_Tensor:$input2, BoolAttr:$transpose2); + let arguments = (ins DenseTensor:$input1, BoolAttr:$transpose1, + DenseTensor:$input2, BoolAttr:$transpose2); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } #endif // TRT_OPS diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 319df90d3eec133d3f02be6749e9ad379fd225fd..81bf873ddf0cf3f1a94489bd3b0b2769274b1b4a 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -33,7 +33,10 @@ #include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" -#endif +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) +#include "paddle/infrt/kernel/tensorrt/registry.h" +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT +#endif // INFRT_WITH_PHI static llvm::cl::list cl_shared_libs( // NOLINT "shared_libs", @@ -62,6 +65,9 @@ int main(int argc, char** argv) { #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(&registry); kernel::RegisterInferShapeLaunchers(&registry); +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) + kernel::RegisterTrtKernels(&registry); +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT #endif // load extra shared library diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index c613843cd1779599fbac5aea6042b26b151534e8..3d5cccb5c32694ff05d10811bbff0f068bd6bc51 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,12 +16,14 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -42,6 +44,13 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace host_context { @@ -277,33 +286,58 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); VLOG(3) << "processing general op : " << op->getName().getStringRef().str(); + // TODO(wilber): Find a more appropriate way to handle special cases. + if (op->getName().getStringRef() == "trt.create_engine") { +#ifdef INFRT_WITH_TRT + auto* symbols = impl_->runtime->symbol_table(); + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol mlir_operation; + mlir_operation.operation = op; + mlir_operation.symbol_table = symbols; + impl_->cur_op->AppendArgument(new Value(mlir_operation)); + // TODO(wilber): how to pass DenseTensor to create_engine op? temporarily + // add a naive implementation. 
+ for (int i = 0, e = op->getNumOperands(); i < e; ++i) { + auto operand = op->getOperand(i); + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + if (arg_value->is_type()) { + impl_->runtime->FeedInArgs( + std::make_pair(std::to_string(i), ValueRef(arg_value))); + } + } + } +#else + CHECK(false) << "should not reach here"; +#endif + } else { + // process operands + for (int i = 0, e = op->getNumOperands(); i < e; i++) { + // function argument as value + auto operand = op->getOperand(i); + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + impl_->cur_op->AppendArgument(arg_value); + VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " + << GetValue(arg); + continue; + } - // process operands - for (int i = 0, e = op->getNumOperands(); i < e; i++) { - // function argument as value - auto operand = op->getOperand(i); - /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { - if (operand.isa()) { - mlir::BlockArgument arg = operand.dyn_cast(); - Value* arg_value = GetValue(arg); + // normal value + Value* arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); impl_->cur_op->AppendArgument(arg_value); - VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " - << GetValue(arg); - continue; - } - // normal value - Value* arg_value = GetValue(operand); - if (!arg_value) { - auto upstream_op = operand.getDefiningOp(); - arg_value = GetOpResult(upstream_op); + VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " + << GetValue(operand) << " vs " << arg_value; } - CHECK(arg_value) << "No-exist argument value found: " - << DumpToString(operand); - impl_->cur_op->AppendArgument(arg_value); - - VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " - << GetValue(operand) << " vs " << arg_value; } // process attributes @@ -383,33 +417,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->cur_op->AppendAttribute(tmp[i]); } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - if (res.getType().isa<::infrt::DenseTensorType>()) { - auto r = impl_->value_map.try_emplace( - res, ValueRef(new Value{::phi::DenseTensor()})); - CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) - << "]"; - res_values.push_back(r.first->second.get()); - } else { - res_values.push_back(AddValue(res)); - } - - VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); - } - impl_->cur_op->SetResults(res_values); - -#ifdef INFRT_DEBUG - { - VLOG(3) << "check result"; - for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { - VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; - } - } -#endif - // process regions, we treat regions as attribute. 
auto num_regions = op->getNumRegions(); if (num_regions > 0) { @@ -438,6 +445,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->cur_op->AppendAttribute(new Value(function)); } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + if (res.getType().isa<::infrt::DenseTensorType>()) { + auto r = impl_->value_map.try_emplace( + res, ValueRef(new Value{::phi::DenseTensor()})); + CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) + << "]"; + res_values.push_back(r.first->second.get()); + } else { + res_values.push_back(AddValue(res)); + } + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + return true; } diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 96aecb755898466ee9530cb3bdc894e51d074cd7..48999a23ef34cd119081810fb4baac77f5fb123b 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -15,7 +15,7 @@ #include "paddle/infrt/host_context/paddle_mlir.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index e825cbb5a11ea0dfcacfc2b1bbb63bf201219c9d..d5f1209b9925b6f2bb916cdd99024a5782485365 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -14,22 +14,22 @@ #ifndef PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ #define PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ +#include +#include +#include +#include +#include +#include #include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/MLIRContext.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" - #include "paddle/infrt/dialect/init_dialects.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/paddle/model_parser.h" diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 957d852442b10620244e230a2f7704eb7fa0a33e..1f0b1dabd94d8dcf28e8e0543a8e3b12ed250704 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -24,6 +24,7 @@ #include "paddle/infrt/common/shared.h" #include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/symbol_table.h" #include "paddle/infrt/support/variant.h" #include "paddle/infrt/tensor/dense_host_tensor.h" #include "paddle/infrt/tensor/dense_tensor_view.h" @@ -41,7 +42,15 @@ #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" -#endif + +#ifdef 
INFRT_WITH_GPU +#include "paddle/phi/backends/gpu/gpu_context.h" +#endif // INFRT_WITH_GPU +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif // INFRT_WITH_TRT +#endif // INFRT_WITH_PHI namespace infrt { namespace host_context { @@ -72,8 +81,13 @@ using ValueVariantType = ::phi::MetaTensor, ::phi::DenseTensor, backends::CpuPhiContext, +#ifdef INFRT_WITH_GPU + backends::GpuPhiContext, + ::phi::GPUContext, +#endif ::phi::CPUContext, std::vector, + std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, std::vector, @@ -81,6 +95,10 @@ using ValueVariantType = paddle::experimental::Backend, paddle::experimental::DataLayout, paddle::experimental::DataType, +#ifdef INFRT_WITH_TRT + ::infrt::backends::tensorrt::TrtEngine, + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol, +#endif // INFRT_WITH_TRT #endif std::vector, std::vector, @@ -120,8 +138,18 @@ class Value : public common::Object { #ifdef INFRT_WITH_PHI explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_GPU + explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {} + explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {} +#endif explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_TRT + explicit Value(::infrt::backends::tensorrt::TrtEngine&& x) + : data(std::move(x)) {} + explicit Value(::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol x) + : data(x) {} +#endif // INFRT_WITH_TRT #endif template diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index f1cbfba1c46b33e461a7c9f08cf646625fbafb24..f20344f6f6b84ae8e63f44c7b7b83c6ba9d8d6da 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(phi) +add_subdirectory(tensorrt) core_gather_headers() diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 39ef172fadef9e0f6317dec192c251c6a1df6828..b27eacf9e522d2bbb8b7ffd70ad57f54e5775499 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -25,6 +25,16 @@ namespace phi { return ctx; } +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext() { + ::phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{}); + context.PartialInitWithAllocator(); + return context; +} +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 3e9580b91da5724b42c72224847e45715f47dbb7..ae3f76c8fe536f96689680668cc52e4981894063 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -25,6 +25,10 @@ namespace phi { ::phi::CPUContext CreateCPUContext(); +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext(); +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 777fb29ac60d9c7125898752747bbdf553f370c0..6d16b814c6b02b08e279190d5a685d65c124942d 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -15,6 
+15,12 @@ #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/place.h" + +#ifdef INFRT_WITH_GPU +#include +#endif namespace infrt { namespace kernel { @@ -34,26 +40,83 @@ namespace phi { {})); } +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, + host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision) { + return ::phi::DenseTensor( + const_cast<::phi::Allocator*>(&context.GetAllocator()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), + ::phi::make_ddim(dims.get()), + ConvertLayoutToPhi(layout.get()), + {})); +} + void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> value) { - auto place = ::phi::CPUPlace(); + auto place = dense_tensor->place(); float* a_data = dense_tensor->mutable_data(place); - for (int64_t i = 0; i < dense_tensor->numel(); ++i) { - a_data[i] = (value.get())[i]; + if (place.GetType() == ::phi::AllocationType::CPU) { + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (value.get())[i]; + } + } else if (place.GetType() == ::phi::AllocationType::GPU) { +#ifdef INFRT_WITH_GPU + // TODO(wilber): how to set the stream parameter to copy with stream. + cudaMemcpy(a_data, + value.get().data(), + sizeof(float) * value.get().size(), + cudaMemcpyHostToDevice); +#endif + } else { + llvm_unreachable("temporarily not support other target."); } } void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { -#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ - case ::phi::DataType::PHI_DATATYPE: { \ - DTYPE* data = dense_tensor->data(); \ - if (dense_tensor->numel() == 0) break; \ - std::cout << data[0]; \ - for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ - std::cout << "," << data[i]; \ - } \ - break; \ +#ifndef INFRT_WITH_GPU +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } \ + break; \ + } +#else +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } else if (place.GetType() == ::phi::AllocationType::GPU) { \ + std::vector host_data(dense_tensor->numel(), 0); \ + cudaMemcpy(host_data.data(), \ + data, \ + sizeof(DTYPE) * dense_tensor->numel(), \ + cudaMemcpyDeviceToHost); \ + std::cout << host_data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << host_data[i]; \ + } \ + } else { \ + llvm_unreachable("temporarily not support other target."); \ + } \ + break; \ } +#endif ::phi::DDim dims = dense_tensor->dims(); std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h 
b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 8cc0e39e0e4431f073ac37a7f0557f2c837dc753..47d89506e2aa615b0bc425a4c373c904d937e03f 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -30,6 +30,13 @@ namespace phi { host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, + host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision); + void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values); void PrintDenseTensor(::phi::DenseTensor* dense_tensor); diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 0e071418603f8390ca3283f617b06cf1fa91b94c..36d40118f16a0bd1779765064caaac6dbe414772 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -35,7 +35,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.create_context.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext)); registry->AddKernelWithAttrs( - "phi_dt.create_dense_tensor", + "phi_dt.create_dense_tensor.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor), {"dims", "lod", "layout", "precision"}); registry->AddKernelWithAttrs( @@ -44,6 +44,15 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { {"value"}); registry->AddKernel("phi_dt.print_tensor", INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); + +#ifdef INFRT_WITH_GPU + registry->AddKernel("phi_dt.create_context.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUContext)); + registry->AddKernelWithAttrs( + "phi_dt.create_dense_tensor.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), + {"dims", "lod", "layout", "precision"}); +#endif } } // namespace kernel diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index b7503aa4ef35894dda514fdb7fa4336485323094..79502f9fdfd4bd88666f61ff30bc526325b91341 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,6 +25,10 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace kernel { using namespace host_context; // NOLINT @@ -62,6 +66,20 @@ DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute name) { int32_t TensorMapGetSize(TensorMap map) { return map.size(); } +// TODO(wilber): Maybe we should place TensorList type in dt dialect. +#ifdef INFRT_WITH_PHI +phi::DenseTensor TensorListGetTensor(std::vector list, + Attribute idx) { + CHECK_LT(idx.get(), static_cast(list.size())) + << "idx should less than list size"; + return *list[idx.get()]; +} + +int32_t TensorListGetSize(const std::vector &list) { + return list.size(); +} +#endif + DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; } template @@ -126,6 +144,14 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(TensorMapGetTensor)); registry->AddKernel("dt.tensor_map_get_size", INFRT_KERNEL(TensorMapGetSize)); +// TensorList related methods. 
+#ifdef INFRT_WITH_PHI + registry->AddKernel("dt.tensor_list_get_tensor", + INFRT_KERNEL(TensorListGetTensor)); + registry->AddKernel("dt.tensor_list_get_size", + INFRT_KERNEL(TensorListGetSize)); +#endif + registry->AddKernel("dt.shallow_copy_tensor", INFRT_KERNEL(ShallowCopyTensor)); diff --git a/paddle/infrt/kernel/tensorrt/CMakeLists.txt b/paddle/infrt/kernel/tensorrt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd35fccbe2aa35453a4d4ac13364ef6bb5a6b6aa --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/CMakeLists.txt @@ -0,0 +1,10 @@ +if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT)) + return() +endif() + +core_gather_headers() + +gather_srcs(infrt_src SRCS + registry.cc + trt_kernels.cc +) diff --git a/paddle/infrt/kernel/tensorrt/registry.cc b/paddle/infrt/kernel/tensorrt/registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..a37e3c0f7f2785e23c8a0b9a25d3283396215f70 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensorrt/registry.h" + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" + +namespace infrt { +namespace kernel { + +void RegisterTrtKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("trt.create_engine", + INFRT_KERNEL(tensorrt::CreateTrtEngine)); + registry->AddKernel("trt.inspect_engine", + INFRT_KERNEL(tensorrt::PrintTrtLayer)); + registry->AddKernel("trt.compute", INFRT_KERNEL(tensorrt::TrtEngineCompute)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/registry.h b/paddle/infrt/kernel/tensorrt/registry.h new file mode 100644 index 0000000000000000000000000000000000000000..762329ca61d02a16edc150854afcc3dd431a941d --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +} // namespace host_context +} // namespace infrt + +namespace infrt { +namespace kernel { + +/** + * Register all the trt kernels to registry. 
+ */ +void RegisterTrtKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc new file mode 100644 index 0000000000000000000000000000000000000000..04847ac8982f861ab2799bd23b1c2ab723422327 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#include +#include "NvInfer.h" +#include "NvInferRuntime.h" +#include "NvInferRuntimeCommon.h" +#include "glog/logging.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol + create_engine_op /*, input_tensors, output_tensors, weights*/) { + // TODO(wilber): The device_id needs to come from mlir. + int device_id = 0; + backends::tensorrt::TrtEngine engine(device_id); + + auto* builder = engine.GetTrtBuilder(); + // TODO(wilber): How to process weights? + backends::tensorrt::TrtUniquePtr network; + // TODO(wilber): static_shape or dynamic_shape network? The code is just + // static_shape test. + network.reset(builder->createNetworkV2(0)); + + // TODO(wilber): The build options should be filled from mlir info. + backends::tensorrt::BuildOptions options; + options.max_batch = 4; + + // Parse mlir Region which only has one block. + mlir::Operation& operation = *create_engine_op.operation; + auto* symbol_table = create_engine_op.symbol_table; + CHECK_NOTNULL(symbol_table); + + unsigned int num_regions = operation.getNumRegions(); + CHECK_EQ(num_regions, 1U) << "only support one region case."; + auto& region = operation.getRegion(0); + auto& block = region.getBlocks().front(); + + llvm::DenseMap map_info; + std::unordered_map trt_bind_inputs; + + for (auto index_operand : llvm::enumerate(operation.getOperands())) { + mlir::Value operand = index_operand.value(); + size_t idx = index_operand.index(); + + const std::string input_name = "input_" + std::to_string(idx); + auto* v = symbol_table->GetValue(std::to_string(idx)); + CHECK_NOTNULL(v); + auto* t = &v->get(); + trt_bind_inputs[input_name] = t; + // TODO(wilber): get input info from mlir. + // TODO(wilber): input dims, now only support static_shape, and just remove + // the first dimension. + // TODO(wilber): now only support float input. 
+ nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = + network->addInput(input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + map_info[operand] = in; + } + + // TODO(wilber): Find a way to add layer. + for (auto& inner_op : block.without_terminator()) { + if (inner_op.getName().getStringRef() == "trt.Activation") { + trt::ActivationOp act_op = llvm::dyn_cast(inner_op); + auto in_arg = act_op.getOperand(); + if (!map_info.count(in_arg)) { + CHECK(false) << "map_info not has in_arg."; + } + nvinfer1::ActivationType act_type = + static_cast(act_op.activation_type()); + auto* act_layer = network->addActivation(*map_info[in_arg], act_type); + act_layer->setAlpha(act_op.alpha().convertToFloat()); + act_layer->setBeta(act_op.beta().convertToFloat()); + for (size_t i = 0; i < act_op->getNumResults(); ++i) { + nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); + mlir::Value act_out = act_op->getResult(i); + map_info[act_out] = act_out_tensor; + } + } + + // if (inner_op.getName().getStringRef() == "trt.Constant") { + // trt::ConstantOp op = llvm::dyn_cast(inner_op); + // mlir::Value op_out = op.getResult(); + // std::vector weight_data{1}; + // auto* layer = network->addConstant(nvinfer1::Dims2(1, 1), + // nvinfer1::Weights{nvinfer1::DataType::kFLOAT, weight_data.data(), 1}); + // auto* op_out_tenor = layer->getOutput(0); + // map_info[op_out] = op_out_tenor; + // } + } + for (auto& inner_op : block.without_terminator()) { + for (mlir::Value v : inner_op.getResults()) { + for (mlir::Operation* user : v.getUsers()) { + if (user->getName().getStringRef() == "infrt.return") { + if (!map_info.count(v)) { + CHECK(false) << "map_info not has value"; + } + network->markOutput(*map_info[v]); + } + } + } + } + // std::unordered_map trt_bind_outputs; + mlir::Operation* ret = block.getTerminator(); + for (unsigned int i = 0; i < ret->getNumOperands(); ++i) { + mlir::Value arg = ret->getOperand(i); + CHECK(map_info.count(arg)); + map_info[arg]->setName(("output_" + std::to_string(i)).c_str()); + } + for (int i = 0; i < network->getNbOutputs(); ++i) { + engine.PrepareOutputHandle(network->getOutput(i)->getName()); + } + + VLOG(3) << "trt engine build start."; + engine.Build(std::move(network), options); + VLOG(3) << "trt engine build done."; + + // TODO(wilber): get inference options from mlir. + backends::tensorrt::InferenceOptions inference_options; + inference_options.batch = 1; + // TODO(wilber): bind trt input/output tensors. + engine.SetUpInference(inference_options, trt_bind_inputs); + return engine; +} + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) { + engine->GetEngineInfo(); +} + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) { + engine->Run(context); + std::vector res; + for (size_t i = 0; i < engine->GetOutputNum(); ++i) { + res.push_back(engine->GetOutput("output_" + std::to_string(i))); + } + return res; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..546ee9dc78852e6967bf8b61ae81563d32beae66 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "mlir/IR/Operation.h" + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace infrt { +namespace host_context { +class SymbolTable; +} // namespace host_context + +namespace kernel { +namespace tensorrt { + +struct MlirOperationWithInfrtSymbol { + mlir::Operation* operation; + ::infrt::host_context::SymbolTable* symbol_table; +}; + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol engine_op); + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine); + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context); + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/tests/dialect/disabled_trt.mlir b/paddle/infrt/tests/dialect/disabled_trt.mlir new file mode 100644 index 0000000000000000000000000000000000000000..ef86dcf1e72a04c478a7763000cf366715665d81 --- /dev/null +++ b/paddle/infrt/tests/dialect/disabled_trt.mlir @@ -0,0 +1,37 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%0 : !infrt.dense_tensor, %ctx : !phi.context) { + %a = "trt.create_engine"(%0) ({ + %1 = "trt.Activation"(%0) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1) : (!infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %t = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + "phi_dt.print_tensor" (%t) : (!infrt.dense_tensor) -> () + + //%res = + infrt.call @run_trt(%t, %ctx) : (!infrt.dense_tensor, !phi.context) -> () + //-> (!infrt.dense_tensor) + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/rewrite.mlir b/paddle/infrt/tests/dialect/pd/rewrite.mlir similarity index 97% rename from paddle/infrt/tests/dialect/rewrite.mlir rename to paddle/infrt/tests/dialect/pd/rewrite.mlir index 9fbb09e22449ff98a28b9e22732351ddbbc49dd0..ea0248b9d95d28e0160192a44f4c542d50a4892d 100644 --- a/paddle/infrt/tests/dialect/rewrite.mlir +++ b/paddle/infrt/tests/dialect/pd/rewrite.mlir @@ -1,4 +1,4 @@ -// 
RUN: infrtopt --canonicalize %s | FileCheck %s +// RUN: infrtopt --pd-op-fuse %s | FileCheck %s // CHECK-LABEL: @main func @main() -> tensor { %a = "pd.feed"() {name="input0"} : () -> tensor diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index 3657777a5b0bce1c5a5e4df8d59695f8b122da56..b8cb1a5cec2a17d3f6d15036249fcf9f7f711948 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: @sign_any_float32_execute func @sign_any_float32_execute() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) { + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) { precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir index 5b0fa735897a31287bb6dea487e2f22eacd7b0aa..21ee8ebf0b705894446192b0d5d0bfeb9f10f326 100644 --- a/paddle/infrt/tests/dialect/phi/phi_test.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir @@ -6,7 +6,7 @@ module { } func @main() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () %2 = infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor phi_dt.print_tensor(%2 : !infrt.dense_tensor) diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir index e3cb9670bec015e58e2a538bb55dfbe7c8b7f554..7bdf62a277896afe2f8a5e156fa8183742f1d853 100644 --- a/paddle/infrt/tests/dialect/trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -1,16 +1,16 @@ // RUN: trt-exec %s // CHECK-LABEL: @main -func @main(%bias:tensor, %c:tensor, %b1:tensor, %b2:tensor, %bias1:tensor, %bias2:tensor) -> tensor { - %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor, tensor) -> tensor - %e = "pd.relu6"(%d) {} : (tensor) -> tensor +func @main(%bias:!infrt.dense_tensor, %c:!infrt.dense_tensor, %b1:!infrt.dense_tensor, %b2:!infrt.dense_tensor, %bias1:!infrt.dense_tensor, %bias2:!infrt.dense_tensor) -> !infrt.dense_tensor { + %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e = "pd.relu6"(%d) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor, tensor) -> tensor - %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor + %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e1 = "pd.relu"(%d1) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, 
%bias2) {axis=-1:si32} : (tensor, tensor) -> tensor - %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor + %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e2 = "pd.relu"(%d2) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - infrt.return %e2 : tensor + infrt.return %e2 : !infrt.dense_tensor } diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index c268742fa567bffecb2fd17a773ab56aee019853..ce40627bb0d3742aff7f60583d2e0b9cbbd8fb02 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -324,7 +324,7 @@ class PADDLE_API Tensor final { * * @return std::shared_ptr */ - std::shared_ptr impl() const; + const std::shared_ptr& impl() const; /** * @brief Set the implemention of current Tensor. @@ -333,6 +333,13 @@ class PADDLE_API Tensor final { */ void set_impl(const std::shared_ptr& impl); + /** + * @brief Set the implemention of current Tensor. + * + * @param impl + */ + void set_impl(std::shared_ptr&& impl); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Get the stream where the tensor is currently located @@ -397,7 +404,9 @@ class PADDLE_API Tensor final { * @param blocking, Should we copy this in sync way. * @return void */ - void copy_(const Tensor& src, const bool blocking); + void copy_(const Tensor& src, + const phi::Place& target_place, + const bool blocking); /** * @brief Cast datatype from one to another * diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 42bf7a8103f837195775b33daf301a7d2e0f4c44..4cbca07236208281f38984022d17b6cb88af8ed8 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -148,4 +148,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) -cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta) diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index e1ebe8c6465cfdd7f8213c0a31416bc77412221c..0c11e2df65d0db23b4e080bf041c78d976714013 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -95,12 +95,8 @@ paddle::optional MakeMetaTensor( /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { - if (!out->initialized()) { - auto dense_tensor = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); - out->set_impl(dense_tensor); - return dense_tensor.get(); + if (out->impl() == nullptr) { + out->set_impl(std::make_shared()); } return static_cast(out->impl().get()); } @@ -111,9 +107,7 @@ std::vector SetKernelOutput(size_t out_size, out->reserve(out_size); std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { - auto tensor_ptr = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); + auto tensor_ptr = 
std::make_shared(); results[i] = tensor_ptr.get(); out->emplace_back(); out->back().set_impl(tensor_ptr); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 79b8ac6d0b8352b2e817e6bdbefca74c835ad6b2..e280ab626da74a9b0951925f7472fa49996691cb 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -167,10 +167,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, if (NeedTransformPlace( out.place(), target_args_def.backend, transform_flag)) { - phi::DenseTensor result( - phi::make_intrusive( - phi::TransToPhiPlace(target_args_def.backend)), - {out.dtype(), out.dims(), out.layout()}); + phi::DenseTensor result; framework::TransDataDevice( out, phi::TransToPhiPlace(target_args_def.backend), &result); out = result; @@ -190,14 +187,14 @@ std::shared_ptr PrepareData( tensor_in->dtype(), target_args_def.dtype, transform_flag) && !NeedTransformLayout( tensor_in->layout(), target_args_def.layout, transform_flag))) { - return std::dynamic_pointer_cast(tensor_in); + return std::static_pointer_cast(tensor_in); } phi::DenseTensor out = TransformData(*(static_cast(tensor_in.get())), target_args_def, transform_flag); - return std::make_shared(out); + return std::make_shared(std::move(out)); } std::shared_ptr PrepareData( diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 40174a505dcc9b3d475254b7cec7691300c7aecf..6be85d720007e8464647974f43d42f8430a827a8 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -46,6 +46,7 @@ limitations under the License. */ * In the future, the necessary components will be moved to the this library, * or the corresponding components will be re-implemented. */ + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" @@ -142,7 +143,12 @@ PlaceType Tensor::place() const { } paddle::platform::Place Tensor::inner_place() const { - return ConvertExtPlaceToInnerPlace(place()); + PADDLE_ENFORCE_NOT_NULL( + impl_, + phi::errors::PermissionDenied( + "Null pointer error, the impl_ of Tensor should not be " + "Null when calling Tensor::inner_place().")); + return impl_->place(); } bool Tensor::is_cpu() const { @@ -286,12 +292,16 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { } } -std::shared_ptr Tensor::impl() const { return impl_; } +const std::shared_ptr &Tensor::impl() const { return impl_; } void Tensor::set_impl(const std::shared_ptr &impl) { impl_ = impl; } +void Tensor::set_impl(std::shared_ptr &&impl) { + impl_ = std::move(impl); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t Tensor::stream() const { return platform::stream::get_current_stream(-1)->raw_stream(); diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 885e29b27fa8e723ad0e89f9a99c2accd3c172f6..cc797507e68ec11005d6ac35d5dca2d19418598d 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -19,9 +19,12 @@ limitations under the License. 
*/ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace experimental { - // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); Tensor copy_to(const Tensor &x, Backend backend, bool blocking); @@ -67,12 +70,18 @@ template PADDLE_API Tensor Tensor::copy_to>( template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -void Tensor::copy_(const Tensor &src, bool blocking) { +void Tensor::copy_(const Tensor &src, + const phi::Place &target_place, + bool blocking) { if (!src.is_initialized()) { + VLOG(8) << "Src is empty, skip copy"; return; } + // Prepare copy kernel key and outputs + auto kernel_key_set = ParseKernelKeyByInputArgs(src); + KernelType kernel_type = ParseKernelTypeByInputArgs(src); VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (defined()) { + if (is_initialized()) { PADDLE_ENFORCE_EQ(dtype(), src.dtype(), platform::errors::PreconditionNotMet( @@ -87,10 +96,91 @@ void Tensor::copy_(const Tensor &src, bool blocking) { "Copy cannot be performed!", name(), src.name())); + PADDLE_ENFORCE_EQ(target_place, + inner_place(), + platform::errors::PreconditionNotMet( + "Place is different of dst tensor and args %s, which " + "current tensor holds %s " + "Copy cannot be performed!", + target_place.DebugString(), + inner_place().DebugString())); + kernel_key_set.backend_set = + kernel_key_set.backend_set | + BackendSet(phi::TransToPhiBackend(inner_place())); + } else { + // Deep Copy AutoGrad info from src to self. + *autograd_meta_ = *(src.autograd_meta_); + } + + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + if (kernel_type == KernelType::DENSE_TENSOR_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::DenseTensor &, + phi::Place, + bool, + phi::DenseTensor *); + SetKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy_sr", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using 
kernel_signature = void (*)(const platform::DeviceContext &, + const phi::SelectedRows &, + phi::Place, + bool, + phi::SelectedRows *); + SetSelectedRowsKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "We currently only support copy for DenseTensor and SelectedRows; if " + "you need other types, please raise an issue.")); } - auto copy_tensor = - src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking); - set_impl(copy_tensor.impl()); } } // namespace experimental diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index a3b252598582bc212ba66f9c18ec52e035a29a68..0394835aa8b700ba4f9ee9b106661e2d70fc50b6 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -741,6 +741,10 @@ struct GPUContext::Impl { GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique()) {} +GPUContext::GPUContext(GPUContext&&) = default; + +GPUContext& GPUContext::operator=(GPUContext&&) = default; + GPUContext::GPUContext(const GPUPlace& place) : DeviceContext(), impl_(std::make_unique(place)) {} diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 3eb4360ad35382369681308b46050cc3e6e04ea0..cd08da1c0f2f8031a461a0410a89254823a6a903 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -77,6 +77,8 @@ class DnnWorkspaceHandle { class GPUContext : public DeviceContext { public: GPUContext(); + GPUContext(GPUContext&&); + GPUContext& operator=(GPUContext&&); explicit GPUContext(const GPUPlace& place); diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 0947870dcd35a689d7ebfde5ddcab12c361358e9..9bf692703860f15601ad601970ea1f5b1316442b 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce) diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index be91409762635e8aabdd6953aa5527d94959e4b2..e502b9cb3e02536e8d764a4cbc5e1d5509960303 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -197,8 +197,16 @@ class Kernel { const KernelArgsDef& args_def() const { return args_def_; } + const TensorArgDef& InputAt(size_t idx) const { + return args_def_.input_defs().at(idx); + } + TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); } + const TensorArgDef& OutputAt(size_t idx) const { + return args_def_.output_defs().at(idx); + } + TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); } bool IsValid() { return fn_ != nullptr; } diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 38dce0dc69d317d95541f3f10ba8018b03b9d6b5..521f2a9bf0648948cb66ae7ebcd623a4f02fa9f2 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -571,6 +571,48 @@ void GatherTreeMeta(const MetaTensor& ids, out->set_dims(ids_dims); } +void GridSampleBaseInferMeta(const MetaTensor& x, + const MetaTensor& grid, + MetaTensor* out, + MetaConfig config) {
+ auto x_dims = x.dims(); + auto grid_dims = grid.dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input(X) of GridSampleOp should be 4-D Tensor, but " + "received X dimension size(%d)", + x_dims.size())); + PADDLE_ENFORCE_EQ(grid_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input(Grid) of GridSampleOp should be 4-D Tensor, " + "but received Grid dimension size(%d)", + grid_dims.size())); + if (config.is_runtime || grid_dims[3] > 0) { + PADDLE_ENFORCE_EQ( + grid_dims[3], + 2, + phi::errors::InvalidArgument( + "Input(Grid) dimension[3] should be 2, but received %d", + grid_dims[3])); + } + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + grid_dims[0], + x_dims[0], + phi::errors::InvalidArgument( + "Input(X) and Input(Grid) dimension[0] should be equal, but " + "received X dimension[0](%d) != Grid dimension[0](%d)", + x_dims[0], + grid_dims[0])); + } + + out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + void HuberLossInferMeta(const MetaTensor& input, const MetaTensor& label, float delta, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 8cf7ce3930e941a3c5243306fa38e4466059509a..9e1a35640ad296933623b3df75d576a5ee5dfc24 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -103,6 +103,11 @@ void GatherTreeMeta(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out); +void GridSampleBaseInferMeta(const MetaTensor& x, + const MetaTensor& grid, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void HuberLossInferMeta(const MetaTensor& input_meta, const MetaTensor& label_meta, float delta, diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.cc b/paddle/phi/kernels/selected_rows/copy_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf71ab0583f6120e7bf10f26f00024b27a56ca79 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/phi/kernels/selected_rows/copy_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst) { + if (src.value().Holder() != dst->value().Holder() || + src.value().data() != dst->value().data()) { + dst->set_rows(src.rows()); + dst->set_height(src.height()); + } + phi::Copy( + dev_ctx, src.value(), dst_place, blocking, dst->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.h b/paddle/phi/kernels/selected_rows/copy_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4aa848bea2a717ffcda4dff562ec56a702b7dbc5 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst); + +} // namespace sr +} // namespace phi diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8149d69d36a27fadcefa8dc6b6ff1dd89792e29e..9439982858530e1e81156be4b32ef2d91dc4a33a 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -565,16 +565,25 @@ def grad(outputs, if isinstance(in_out_list, (list, tuple)): assert len(in_out_list) > 0, "{} cannot be empty".format(name) for each_var in in_out_list: - assert isinstance( - each_var, - core.VarBase), "Elements of {} must be Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager. + Tensor), "Elements of {} must be Tensor".format(name) + else: + assert isinstance( + each_var, + core.VarBase), "Elements of {} must be Variable".format( + name) return in_out_list else: - assert isinstance( - in_out_list, - core.VarBase), "{} must be Variable or list of Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + in_out_list, core.eager. 
+ Tensor), "{} must be Tensor or list of Tensor".format(name) + else: + assert isinstance( + in_out_list, core.VarBase + ), "{} must be Variable or list of Variable".format(name) return [in_out_list] outputs = check_in_out(outputs, 'outputs') @@ -586,9 +595,14 @@ def grad(outputs, for each_var in grad_outputs: if each_var is not None: - assert isinstance( - each_var, core.VarBase - ), "grad_outputs must be None, a Variable or a list containing None or Variables" + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager.Tensor + ), "grad_outputs must be None, a Variable or a list containing None or Variables" + else: + assert isinstance( + each_var, core.VarBase + ), "grad_outputs must be None, a Variable or a list containing None or Variables" else: grad_outputs = [] @@ -600,14 +614,27 @@ def grad(outputs, no_grad_vars = [] elif isinstance(no_grad_vars, core.VarBase): no_grad_vars = [no_grad_vars] + elif isinstance(no_grad_vars, core.eager.Tensor): + no_grad_vars = [no_grad_vars] elif isinstance(no_grad_vars, (list, tuple, set)): no_grad_vars = list(no_grad_vars) for var in no_grad_vars: - assert isinstance( - var, core.VarBase), "no_grad_vars can only contains Variable" + if core._in_eager_mode(): + assert isinstance( + var, + core.eager.Tensor), "no_grad_vars can only contains Tensor" + else: + assert isinstance( + var, + core.VarBase), "no_grad_vars can only contains Variable" else: - raise AssertionError( - "no_grad_vars must be None, Variable or list/tuple/set of Variables") + if core._in_eager_mode(): + raise AssertionError( + "no_grad_vars must be None, Tensor or list/tuple/set of Tensors") + else: + raise AssertionError( + "no_grad_vars must be None, Variable or list/tuple/set of Variables" + ) assert isinstance(create_graph, bool), "create_graph must be True or False" @@ -622,6 +649,11 @@ def grad(outputs, assert isinstance(only_inputs, bool), "only_inputs must be True or False" assert only_inputs, "only_inputs=False is not supported yet" + if core._in_eager_mode(): + return core.eager.run_partial_grad( + outputs, inputs, grad_outputs, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars) + place = core.Place() place.set_place(framework._current_expected_place()) return core.dygraph_partial_grad(inputs, outputs, grad_outputs, diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index f58952d3036c506341955eff2472079bb696bb1f..a36164a277dec0762e7ba49a1d158837f27bc517 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -30,6 +30,7 @@ from paddle.fluid.layers import nn from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import in_dygraph_mode +from paddle import _C_ops __all__ = ['TranslatedLayer'] @@ -761,6 +762,21 @@ def _construct_params_and_buffers(model_path, return var_dict +def _valid_vars(vars): + if vars: + return vars + if framework._in_eager_mode(): + return [ + core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + else: + return [ + core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + + def _run_dygraph(instance, input, program_holder): # 1. 
prepare inputs, outputs, attrs @@ -826,17 +842,12 @@ def _run_dygraph(instance, input, program_holder): # hold forward variables if framework._in_eager_mode(): - tmp_scope_vec = core.eager.Tensor( - dtype=core.VarDesc.VarType.FP32, - dims=[], - name="program_out_scope", - type=core.VarDesc.VarType.STEP_SCOPES, - persistable=True) + tmp_scope_vec = [program_holder.scope] else: tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], "program_out_scope", core.VarDesc.VarType.STEP_SCOPES, True) - tmp_scope_vec.value().set_scope(program_holder.scope) + tmp_scope_vec.value().set_scope(program_holder.scope) double_grad_vars = [] for var_desc in program_holder.double_grad_descs: @@ -852,41 +863,18 @@ def _run_dygraph(instance, input, program_holder): var_desc.shape(), var_desc.name(), var_desc.type(), False) double_grad_vars.append(var) - if len(double_grad_vars) == 0: - if framework._in_eager_mode(): - double_grad_vars = [ - core.eager.Tensor( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] - else: - double_grad_vars = [ - core.VarBase( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] # 2. run program by op trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program end_op_index = program_holder.infer_program.block(0).op_size() - framework._dygraph_tracer().trace_op( - type='run_program', - inputs={'X': input_vars, - 'Params': persistable_vars}, - outputs={ - 'Out': output_vars, - 'OutScope': tmp_scope_vec, - 'DOut': double_grad_vars - }, - attrs={ - 'global_block': trace_program.block(0), - 'start_op_index': 0, - 'end_op_index': end_op_index, - 'is_test': instance._is_test, - 'program_id': _hash_with_id(trace_program, instance) - }) + attrs = ('global_block', trace_program.block(0), 'start_op_index', 0, + 'end_op_index', end_op_index, 'is_test', instance._is_test, + 'program_id', _hash_with_id(trace_program, instance)) + _C_ops.run_program( + _valid_vars(input_vars), + _valid_vars(persistable_vars), + _valid_vars(output_vars), tmp_scope_vec, + _valid_vars(double_grad_vars), *attrs) # NOTE: [ why need set param's gradient type here ] # if user set sparse gradient mode, the param's gradient # will be SelectedRows, not LoDTensor. 
But tracer will just @@ -914,8 +902,10 @@ def _run_dygraph(instance, input, program_holder): def drop_scope_if_no_grad(instance, scope_vec): tracer = framework._dygraph_tracer() + scope = scope_vec.value().get_scope() if isinstance(scope_vec, ( + core.VarBase)) else scope_vec[0] if (not instance._is_test) and (not tracer._has_grad): - scope_vec.value().get_scope().drop_kids() + scope.drop_kids() def _run_static_graph(input, program_holder, trace_program): diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index b1865691b2475c4f855f51244e627965047d7720..1e1ce3ba7e4912d391085c7acbd7aa4bbb6a4da1 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -821,7 +821,7 @@ def save(layer, path, input_spec=None, **configs): for var in flatten(input_spec): if isinstance(var, paddle.static.InputSpec): inner_input_spec.append(var) - elif isinstance(var, (core.VarBase, Variable)): + elif isinstance(var, (core.VarBase, core.eager.Tensor, Variable)): inner_input_spec.append( paddle.static.InputSpec.from_tensor(var)) else: diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 53dbf1a66b27f35a75b44a0b6444cd8282c5278c..6957850d205794363183b4e6ca58a6daf3e11358 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -760,7 +760,8 @@ class Layer(object): raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif tensor is not None and not type(tensor) == core.VarBase: + elif tensor is not None and not (type(tensor) == core.VarBase or + type(tensor) == core.eager.Tensor): raise TypeError( "The registered buffer should be a core.VarBase, but received {}.". 
format(type(tensor).__name__)) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cbe360f556cd986646bc7f45b3a80ab0f5edb9eb..c82172780b7b2e27e430d0494ce59f7dce626d74 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1118,9 +1118,9 @@ set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index cac64c7391351b23c4b9f9275c4b20bf85f571fd..2b8307461b8f57ea73503cf6ad4e8a90cdba652c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator +from paddle.fluid.framework import _test_eager_guard from predictor_utils import PredictorTools @@ -155,6 +156,13 @@ class TestMNISTWithToStatic(TestMNIST): np.allclose(dygraph_loss, static_loss), msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, static_loss)) + with _test_eager_guard(): + dygraph_loss = self.train_dygraph() + static_loss = self.train_static() + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, + static_loss)) def test_mnist_declarative_cpu_vs_mkldnn(self): dygraph_loss_cpu = self.train_dygraph() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py index 8ff68a1ce0d69307547db2fd1f83526094c9bfcf..91c340c35d478d9576dcc3f1b15d4d2300692c5a 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py @@ -19,6 +19,7 @@ import unittest import os import numpy as np import random +import socket import paddle import paddle.nn as nn @@ -31,13 +32,26 @@ from paddle.optimizer import SGD from paddle.fluid.initializer import NumpyArrayInitializer +def net_is_used(port, ip='127.0.0.1'): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((ip, port)) + s.shutdown(2) + return True + except Exception as e: + return False + + def init_process_group(strategy=None): nranks = ParallelEnv().nranks rank = 
ParallelEnv().local_rank is_master = True if rank == 0 else False - store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks) - group = core.ProcessGroupNCCL(store, rank, nranks) - return group + for port in range(20000, 21000): + if not net_is_used(port): + store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master, + nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group class LinearModel(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 27aec284de4cdebb5ebb9191bfb67d48c1b327f5..98ef339e04535bb943add02b6cf6efe490f0354b 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -52,7 +52,7 @@ class EagerScaleTestCase(unittest.TestCase): out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) self.assertIsNone(data_eager.grad) out_eager.backward(grad_eager, False) - self.assertTrue(data_eager.grad._is_initialized()) + self.assertIsNotNone(data_eager.grad) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) def test_retain_grad_and_run_backward_raises(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index cd4ba5b054264afca65d4c4d8359eb1854fbb658..7436e9eb7b12623296d7a714e742cc4212c4ca91 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,6 +19,9 @@ from paddle.vision.models import resnet50, resnet101 import unittest from unittest import TestCase import numpy as np +import paddle.compat as cpt +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -40,6 +43,80 @@ def random_var(size, low=-1, high=1, dtype='float32'): return fluid.dygraph.to_variable(x_np) +class TestEagerGrad(TestCase): + def func_simple_example_eager_grad(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + out = paddle.matmul(x, y) + dx = fluid.dygraph.grad(out, x) + + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0])) + + def test_simple_example_eager_grad(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad() + self.func_simple_example_eager_grad() + + def func_simple_example_eager_grad_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + dx = fluid.dygraph.grad(out, [x, z], allow_unused=True) + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0])) + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + # x is unused input in the graph + self.assertEqual(dx[1], None) + + def test_simple_example_eager_grad_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_allow_unused() + self.func_simple_example_eager_grad_allow_unused() + + def func_simple_example_eager_grad_not_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # allow_unused is false in default + dx = fluid.dygraph.grad(out, [x, z]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_simple_example_eager_grad_not_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_not_allow_unused() + self.func_simple_example_eager_grad_not_allow_unused() + + class TestDygraphDoubleGrad(TestCase): def setUp(self): self.sort_sum_gradient = False @@ -64,7 +141,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -93,8 +170,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): 
self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -123,8 +205,44 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_example_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = fluid.layers.relu(x) + y2 = fluid.layers.relu(x) + z = y1 + y2 + w = z * z + + w_mean = fluid.layers.reduce_mean(w) + del y1, z, w + + dx_actual, = self.grad( + [w_mean], [x], create_graph=True, no_grad_vars=[y2]) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * + (x_np > 0) * 2).astype('float32') + + self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) + + def test_example_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_no_grad_vars() + self.func_example_no_grad_vars() + + @dygraph_guard + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -190,8 +308,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -214,25 +337,33 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward(retain_graph=True) - - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - - for i in range(5): + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss.backward(retain_graph=True) + x_grad_actual = x.gradient() - x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_grad_expected = (2.0 / float(numel) * ( x_np + dx_expected * (x_np > 0) * 2 / float(numel))).astype('float32') self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() + @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = 
random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -256,17 +387,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -289,12 +428,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): @@ -304,7 +451,7 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): class TestDygraphDoubleGradVisitedUniq(TestCase): - def test_compare(self): + def func_compare(self): value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2, 5).astype("float32") @@ -349,6 +496,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): self.assertTrue(np.array_equal(grad_1, grad_2)) + def test_compare(self): + with _test_eager_guard(): + self.func_compare() + self.func_compare() + class TestRaiseNoDoubleGradOp(TestCase): def raise_no_grad_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index 2ffe523ef6dda18a24813e702a1892c335ba6a68..531e9663a2b728a2871dff404425b063a0c47e67 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ import unittest from unittest import TestCase import numpy as np import paddle +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -62,7 +64,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -91,8 +93,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -121,8 +128,13 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -188,8 +200,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -212,17 +229,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -246,17 +271,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - 
x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -279,12 +312,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index f06c45cc369737403025ed264815a98b81acc6da..7c0c71951aa1d7a566cabf73ecb9d26e03b8dab6 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -193,7 +193,7 @@ class InputSpec(object): print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) """ - if isinstance(tensor, (Variable, core.VarBase)): + if isinstance(tensor, (Variable, core.VarBase, core.eager.Tensor)): return cls(tensor.shape, tensor.dtype, name or tensor.name) else: raise ValueError( diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index d91b76bb70314a2d516b8a384cf3406b7f9e4d0d..bf3d7b3d19eab806706f1d2d654957aac5b33434 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -698,7 +698,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag) api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" -{code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( +{code_indent} const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); {code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; {code_indent} VLOG(6) << "{self.api} API kernel: " << kernel; diff --git a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td index f754767259563f2cd64bac92adf76249b18af11f..861b31941200fd8a7482482cb683ff969bd05a18 100644 --- a/tools/infrt/custom_pdop.td +++ b/tools/infrt/custom_pdop.td @@ -23,16 +23,6 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins PD_Tensor :$inputs, 
StrAttr:$name); } -def PD_ReturnOp : PD_Op<"return", [Terminator]> { - let summary = "return Op"; - - let description = [{ - Fetch tensor from the graph. - }]; - - let arguments = (ins Variadic:$inputs); -} - def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 027dfe4328a55ff246928cbc9ab6d3d36f15e1fd..8855e1eee38717a6cffc14e9c1762af36e94fa84 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -16,8 +16,6 @@ import paddle.fluid.framework as framework from paddle.fluid import core from paddle import compat as cpt -ops_having_canonicalization = {"elementwise_add", } - # collect original ops: op which has both inference and grid defination def get_original_ops(): @@ -186,7 +184,7 @@ def generate_all_ops_inputs_outputs_map(op_descs): cpp_style_ops_outputs_map_str = start_ + ops_outputs_str + "\n};" # 3. Write to header file - dst_head_file = "../../paddle/infrt/dialect/pd_ops_info.h" + dst_head_file = "../../paddle/infrt/dialect/pd/common/pd_ops_info.h" with open(dst_head_file, 'w') as ops_inputs_outputs_head_file: ops_inputs_outputs_head_file.write(cpp_style_ops_inputs_map_str) ops_inputs_outputs_head_file.write("\n\n") @@ -195,7 +193,7 @@ def generate_all_ops_inputs_outputs_map(op_descs): # funtion to generate paddle op dialect file def convert_op_proto_into_mlir(op_descs): - dst_dialect_file = "../../paddle/infrt/dialect/pd_ops.td" + dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td" custom_dialect_file = "custom_pdop.td" # 1. Head files @@ -214,7 +212,7 @@ def convert_op_proto_into_mlir(op_descs): "include \"mlir/Interfaces/InferTypeOpInterface.td\"", "include \"mlir/Interfaces/LoopLikeInterface.td\"", "include \"mlir/IR/OpBase.td\"", - "include \"paddle/infrt/dialect/pd_op_base.td\"", + "include \"paddle/infrt/dialect/pd/ir/pd_op_base.td\"", "", ] @@ -245,7 +243,6 @@ def convert_op_proto_into_mlir(op_descs): op_type=op_type, left_brace="{") SUMMARY = ' let summary = "{} op";\n'.format(op_type) - CANONICALIZATION = "let hasCanonicalizer = 1;" if op_type in ops_having_canonicalization else "" # 2.2 Description contents = "" @@ -348,7 +345,6 @@ def convert_op_proto_into_mlir(op_descs): ops_mlir_file.write(DESCRIPTION) ops_mlir_file.write(ARGUMENTS) ops_mlir_file.write(RESULTS) - ops_mlir_file.write(CANONICALIZATION) ops_mlir_file.write("}\n") print("Skipped ops num: " + str(len(skipped_op_list)))