Commit 4be77e53, authored by: P phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_temporal_shift_to_phi

......@@ -52,12 +52,12 @@ tools/__pycache__
# This file is automatically generated.
# TODO(zhiqiang) Move this file to build directory.
paddle/infrt/dialect/pd_ops.td
paddle/infrt/dialect/pd/ir/pd_ops.td
paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
tools/infrt/kernels.json
tools/infrt/kernel_signature.json
paddle/infrt/dialect/pd_ops_info.h
paddle/infrt/dialect/pd/common/pd_ops_info.h
.lit_test_times.txt
paddle/infrt/tests/dialect/Output
paddle/infrt/tests/lit.cfg.py
......
......@@ -24,7 +24,7 @@
#include "paddle/fluid/platform/errors.h"
#include "glog/logging.h"
DECLARE_bool(retain_grad_for_all_tensor);
namespace egr {
static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
......@@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
}
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
PADDLE_ENFORCE(grads.size() == 1,
paddle::platform::errors::Fatal(
......@@ -62,7 +62,7 @@ operator()(
grad_out = grads[0][0];
}
if (!weak_grad_.expired()) {
if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) {
auto grad = weak_grad_.lock();
CopyOrAddTensor(grad.get(), grad_out);
}
......
......@@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override;
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
std::string name() { return "GradNodeAccumulation"; }
......
......@@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X(
void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; }
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
// 1. Check Output Size
PADDLE_ENFORCE(
((grads.size() == 1) && (grads[0].size() == 1)),
......
......@@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override;
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
void SetTensorWrappers_X(
const std::vector<paddle::experimental::Tensor>& tensors);
......
......@@ -2074,7 +2074,8 @@ static std::string GenerateGradNodeCCContents(
const char* GRAD_FUNCTION_TEMPLATE =
"std::vector<std::vector<paddle::experimental::Tensor>> "
"GradNode%s::operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads) {\n%s\n}";
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, "
"bool create_graph) {\n%s\n}";
std::string grad_function_str = paddle::string::Sprintf(
GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body);
......@@ -2109,18 +2110,28 @@ static std::string GenerateGradNodeHeaderContents(
"\n"
" virtual std::vector<std::vector<paddle::experimental::Tensor>> "
"operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, const "
"bool create_graph = false) "
"override;\n"
"\n"
" void ClearTensorWrappers() override { \n"
"%s\n"
" is_tensor_wrappers_cleared = true;\n"
" }\n"
" std::string name() override { return \" GradNode%s \"; } \n "
"\n"
" // SetX, SetY, ...\n"
"%s\n"
" // SetAttrMap\n"
"%s\n"
" bool IsTensorWrappersCleared() override { \n"
" return is_tensor_wrappers_cleared;\n"
" }\n"
" private:\n"
" // TensorWrappers\n"
"%s\n"
" bool is_tensor_wrappers_cleared = false;\n"
"\n"
" // Attribute Map\n"
"%s\n"
"};";
......@@ -2154,6 +2165,7 @@ static std::string GenerateGradNodeHeaderContents(
std::string set_tensor_wrappers_str = "";
std::string tensor_wrapper_members_str = "";
std::string clear_tensor_wrappers_str = "";
for (const auto& iter : op_base_infos) {
const std::map<std::string, std::string>& grad_ins_fwd_slotname_map =
iter.GetGradInsFwdSlotnameMap();
......@@ -2185,6 +2197,13 @@ static std::string GenerateGradNodeHeaderContents(
SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name,
struct_tensor_wrapper_name);
const char* CLEAR_TENSOR_WRAPPER_TEMPLATE =
"for (auto tw: %s) {\n"
" tw.clear();\n"
" }\n";
clear_tensor_wrappers_str += paddle::string::Sprintf(
CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
} else {
const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE =
"const paddle::experimental::Tensor& %s";
......@@ -2197,10 +2216,14 @@ static std::string GenerateGradNodeHeaderContents(
TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name);
const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE =
"%s = egr::TensorWrapper(%s, %s /*full_reserved*/);";
"%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n";
tensor_wrapper_body_str = paddle::string::Sprintf(
SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name,
tensor_wrapper_name, full_reserved_str);
const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n";
clear_tensor_wrappers_str += paddle::string::Sprintf(
CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
}
std::string full_reserved_signature_str = "bool full_reserved";
const char* SET_TENSOR_WRAPPER_TEMPLATE =
......@@ -2215,8 +2238,8 @@ static std::string GenerateGradNodeHeaderContents(
std::string grad_node_str = paddle::string::Sprintf(
GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
op_type, op_type, set_tensor_wrappers_str, set_attr_map_str,
tensor_wrapper_members_str, attr_members_str);
op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str,
set_attr_map_str, tensor_wrapper_members_str, attr_members_str);
return grad_node_str;
}
......
......@@ -478,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
# SetTensorWrapper Methods & TensorWrapper Members
set_tensor_wrapper_methods_str = ""
tensor_wrapper_members_str = ""
clear_tensor_wrapper_str = ""
for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items():
if tname in no_need_buffer_set:
no_need_buffer = "true"
......@@ -499,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
"""
tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
{}.clear();
"""
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
tensor_wrapper_name)
else:
assert IsVectorTensorType(ttype)
SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """
......@@ -516,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
"""
tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
for (auto tw: {}) {
tw.clear();
};
"""
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
tensor_wrapper_name)
# End: SetTensorWrapper Methods & TensorWrapper Members
# SetAttributes & Attribute Members
......@@ -524,7 +541,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
for aname, atype, default_val, _ in backward_attrs_list:
saved_attr_name = GetSavedName(aname)
SET_ATTR_METHOD_TEMPLATE = """
void SetAttribute{}({} {}) {{
void SetAttribute{}({} {}) {{
{} = {};
}}
"""
......@@ -555,25 +572,37 @@ class {} : public egr::GradNodeBase {{
~{}() override = default;
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
std::string name() override {{ return \" {} \"; }}
void ClearTensorWrappers() override {{
{}
is_tensor_wrappers_cleared = true;
}}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
// SetAttributes
{}
bool IsTensorWrappersCleared() override {{
return is_tensor_wrappers_cleared;
}}
private:
// TensorWrappers
{}
bool is_tensor_wrappers_cleared = false;
// Attributes
{}
}};
"""
node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
grad_node_name, grad_node_name, grad_node_name, grad_node_name,
grad_node_name, set_tensor_wrapper_methods_str,
set_attribute_methods_str, tensor_wrapper_members_str,
attribute_members_str)
grad_node_name, clear_tensor_wrapper_str,
set_tensor_wrapper_methods_str, set_attribute_methods_str,
tensor_wrapper_members_str, attribute_members_str)
return node_declaration_str
......@@ -637,7 +666,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
grad_api_namespace = f"paddle::experimental"
FUNCTION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {{
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
// Call grad_api function
auto grad_api_returns = {}::{}({});
{}
......
......@@ -39,12 +39,21 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
// Copy nodes
std::queue<GradNodeBase*> queue = init_queue;
std::unordered_set<GradNodeBase*> visited;
size_t potential_startup_ops_cnt = queue.size();
size_t cnt = 0;
// Visit each node exactly once in any order
while (!queue.empty()) {
GradNodeBase* node = queue.front();
queue.pop();
if (cnt < potential_startup_ops_cnt) {
if (!node_in_degree_map.count(node)) {
node_in_degree_map[node] = 0;
}
cnt += 1;
}
if (visited.count(node)) {
continue;
}
......@@ -76,23 +85,248 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
return node_in_degree_map;
}
void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph) {
paddle::platform::RecordEvent backward_record_event(
"backward", paddle::platform::TracerEventType::Operator, 1);
// Remove nodes that do not need to be kept in
// potential_stop_nodes / potential_startup_nodes.
void UpdateGraphInfo(
    std::unordered_map<GradNodeBase*, AutogradMeta*>*
        target_nodes_inputmeta_map,
    std::unordered_map<GradNodeBase*, std::unordered_set<GradNodeBase*>>*
        depending_nodes,
    std::unordered_set<GradNodeBase*>* potential_stop_nodes,
    std::unordered_set<GradNodeBase*>* potential_startup_nodes) {
  // Update potential_stop_nodes using depending_nodes: walk upwards from
  // every target node through its father nodes and make sure no node on a
  // root -> target_node path is left marked as a stop node, otherwise the
  // gradient flow to the target would be cut off.
  std::unordered_set<GradNodeBase*> _startup_ops;
  VLOG(6) << "Running in UpdateGraphInfo";
  std::queue<GradNodeBase*> queue;
  for (auto& target_nodes_inputmeta_pair : *target_nodes_inputmeta_map) {
    queue.emplace(target_nodes_inputmeta_pair.first);
  }
  // Track visited nodes. Without this, a node reachable through several
  // paths (diamond-shaped dependencies) is re-enqueued and re-expanded each
  // time -- exponential in the worst case -- and a cyclic graph would never
  // terminate. All updates below (erase/emplace on sets) are idempotent, so
  // skipping repeat visits leaves the resulting sets unchanged.
  std::unordered_set<GradNodeBase*> visited;
  while (!queue.empty()) {
    auto* target_node = queue.front();
    queue.pop();
    if (!visited.emplace(target_node).second) {
      continue;  // already processed this node
    }
    if (!(*depending_nodes)[target_node].empty()) {
      // target_node has father nodes: none of them may act as a stop node.
      auto precedding_nodes = (*depending_nodes)[target_node];
      for (auto pre_nodes : precedding_nodes) {
        queue.emplace(pre_nodes);
        if (potential_stop_nodes->find(pre_nodes) !=
            potential_stop_nodes->end()) {
          potential_stop_nodes->erase(pre_nodes);
        }
      }
    } else {  // startup ops have no preceding nodes
      VLOG(6) << "Emplace _startup_ops";
      _startup_ops.emplace(target_node);
    }
  }
  // Purify potential_startup_nodes again: remove potential startup nodes
  // that cannot reach any of the input target nodes.
  if (!_startup_ops.empty()) {
    std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
    for (auto node : *potential_startup_nodes) {
      if (_startup_ops.count(node) == 0) {
        VLOG(6) << "Set up potential_startup_nodes_to_be_erased";
        potential_startup_nodes_to_be_erased.emplace(node);
      }
    }
    if (!potential_startup_nodes_to_be_erased.empty()) {
      for (auto node : potential_startup_nodes_to_be_erased) {
        VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased";
        potential_startup_nodes->erase(node);
      }
    }
  }
}
// Get graph info between the input target grad nodes and the outputs:
// records depending_nodes (child -> father set), potential_stop_nodes and
// potential_startup_nodes via a BFS over the backward graph, then calls
// UpdateGraphInfo to purify those sets.
void GetGraphInfoBetweenTargets(
    const std::queue<GradNodeBase*>& init_queue,
    std::unordered_map<GradNodeBase*, AutogradMeta*>*
        input_target_nodes_inputmeta_map,
    std::unordered_map</*child node*/ GradNodeBase*,
                       /*father nodes*/ std::unordered_set<GradNodeBase*>>*
        depending_nodes,
    std::unordered_set<GradNodeBase*>* potential_stop_nodes,
    std::unordered_set<GradNodeBase*>* potential_startup_nodes) {
  // Nothing to record when no input targets were requested (Backward path).
  if (input_target_nodes_inputmeta_map->empty()) return;
  VLOG(6) << "Runing In GetGraphInfoBetweenTargets";
  // Calculate in_degree for each node
  std::unordered_map<GradNodeBase*, int> node_in_degree_map;
  // Copy nodes
  std::queue<GradNodeBase*> queue = init_queue;
  std::unordered_set<GradNodeBase*> visited;
  // Visit each node exactly once in any order
  while (!queue.empty()) {
    GradNodeBase* node = queue.front();
    queue.pop();
    if (visited.count(node)) {
      continue;
    }
    visited.insert(node);
    // True when the current node IS one of the requested input target
    // nodes. In that case every successor of this node is a potential stop
    // node: gradient flow does not need to continue past a requested input.
    bool is_potential_stop_nodes =
        input_target_nodes_inputmeta_map->count(node);
    // Find and append next nodes
    const std::vector<std::vector<Edge>>& edges = node->GetEdges();
    for (const auto& edge_list : edges) {
      for (const Edge& edge : edge_list) {
        GradNodeBase* next_node = edge.GetMutableGradNode().get();
        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
        if (!next_node) continue;
        // If the current node is an input target node, all of its
        // next_nodes are inserted into potential_stop_nodes (they may be
        // pruned later by UpdateGraphInfo if another target lies beyond).
        if (is_potential_stop_nodes) {
          potential_stop_nodes->emplace(next_node);
        }
        // Update in_degree
        if (!node_in_degree_map.count(next_node))
          node_in_degree_map[next_node] = 0;
        node_in_degree_map[next_node]++;
        // Record depending relationship: next_node depends on node.
        (*depending_nodes)[next_node].emplace(node);
        queue.push(next_node);
      }
    }
  }
  // Update Graph Info, remove some stop_node in potential_stop_nodes
  UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes,
                  potential_stop_nodes, potential_startup_nodes);
}
// For every tensor in `inputs`, look up its grad node and record the mapping
// grad_node -> AutogradMeta into target_nodes_inputmeta_map.
// Raises a Fatal error when a tensor has no grad node attached (e.g. its
// stop_gradient is set), since a gradient target must be reachable.
void GetTargetNodesInfo(const std::vector<paddle::experimental::Tensor>& inputs,
                        std::unordered_map<GradNodeBase*, AutogradMeta*>*
                            target_nodes_inputmeta_map) {
  VLOG(6) << "Running in GetTargetNodesInfo";
  if (!inputs.empty()) {
    VLOG(6) << "Inputs are not empty";
    size_t num_inputs = inputs.size();
    for (size_t i = 0; i < num_inputs; i++) {
      AutogradMeta* auto_grad_meta =
          EagerUtils::unsafe_autograd_meta(inputs[i]);
      auto target_node = auto_grad_meta->GetMutableGradNode().get();
      // BUGFIX: the two adjacent string literals previously concatenated to
      // "...or it'sstop_gradient=True" -- add the missing separator space.
      PADDLE_ENFORCE_NOT_NULL(target_node,
                              paddle::platform::errors::Fatal(
                                  "There is no grad op for input:%d or its "
                                  "stop_gradient=True",
                                  i));
      (*target_nodes_inputmeta_map)[target_node] = auto_grad_meta;
    }
  }
}
// Collect the computed gradient for each requested input, preserving input
// order. An input whose grad node never produced a result yields an empty
// Tensor when allow_unused is true, and raises InvalidArgument otherwise.
// When create_graph is true the returned grads stay differentiable.
std::vector<paddle::experimental::Tensor> GetResults(
    const std::vector<paddle::experimental::Tensor>& inputs,
    std::unordered_map<GradNodeBase*, paddle::experimental::Tensor>*
        results_map,
    bool allow_unused, bool create_graph) {
  VLOG(6) << "Running in GetResults";
  if (inputs.empty()) return {};

  std::vector<paddle::experimental::Tensor> results;
  results.reserve(inputs.size());

  for (size_t idx = 0; idx < inputs.size(); ++idx) {
    AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(inputs[idx]);
    auto grad_node = meta->GetMutableGradNode().get();
    auto found = results_map->find(grad_node);
    if (found == results_map->end()) {
      // No gradient reached this input's grad node.
      PADDLE_ENFORCE_EQ(allow_unused, true,
                        paddle::platform::errors::InvalidArgument(
                            "The %d-th input does not appear in the backward "
                            "graph. Please check the input variable or set "
                            "allow_unused=True to get None result.",
                            idx));
      results.emplace_back();
    } else {
      // The resulting grad is itself part of the autograd graph only when
      // create_graph was requested, hence StopGradient = !create_graph.
      AutogradMeta* grad_meta = EagerUtils::autograd_meta(&(found->second));
      grad_meta->SetStopGradient(!create_graph);
      results.emplace_back(found->second);
    }
  }
  return results;
}
// Enforce that the GradNode still holds its TensorWrappers (its saved
// forward inputs). Wrappers are cleared by ClearTensorWrappers() after a
// backward pass without retain_graph=True, so running this node again would
// read released tensors -- fail fast with an explanatory message instead.
void EnforceGradNodeHasInput(GradNodeBase* node) {
  VLOG(6) << "Running in EnforceGradNodeHasInput";
  PADDLE_ENFORCE_NE(
      node->IsTensorWrappersCleared(), true,
      paddle::platform::errors::Fatal(
          "The TensorWrappers of %s do not exist. This may be because:\n"
          "You calculate backward twice for the same subgraph without "
          "setting retain_graph=True. Please set retain_graph=True in the "
          "first backward/grad call.\n",
          node->name()));
}
// Purify potential_startup_nodes: drop every node that is itself one of the
// input target nodes -- a requested input's own grad node must not be used
// as a traversal start point.
void PurifyPotentialStartUpNodes(
    std::unordered_set<GradNodeBase*>* potential_startup_nodes,
    std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>*
        input_target_nodes_inputmeta_map) {
  VLOG(6) << "Running in PurifyPotentialStartUpNodes";
  if (input_target_nodes_inputmeta_map->empty()) return;
  // Two-phase removal: collect offenders first, then erase, because erasing
  // from an unordered_set while iterating it invalidates the iterator.
  std::vector<GradNodeBase*> nodes_to_erase;
  for (auto* candidate : *potential_startup_nodes) {
    if (input_target_nodes_inputmeta_map->count(candidate) != 0) {
      nodes_to_erase.push_back(candidate);
    }
  }
  for (auto* node : nodes_to_erase) {
    potential_startup_nodes->erase(node);
  }
}
std::vector<paddle::experimental::Tensor> RunBackward(
const std::vector<paddle::experimental::Tensor>& tensors, // output
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph, bool create_graph = false,
const std::vector<paddle::experimental::Tensor>& inputs = {},
bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
VLOG(6) << "Start Backward";
// *Gradient Hook should happen at node-level
// *Inplace version check should perform at node-level
// *Cross-batch accumulation happens at forward pass
std::unordered_map<GradNodeBase*, AutogradMeta*>
no_grad_var_nodes_inputmeta_map;
// Get no_grad_vars's GradNodes and InputMeta Info
GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map);
/* --- Initialization --- */
// 1. Init queue with starting nodes
// 2. Prepare initial input buffers
std::queue<GradNodeBase*> queue;
std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
node_input_buffers_dict;
std::unordered_set<GradNodeBase*> potential_startup_nodes;
for (size_t i = 0; i < tensors.size(); i++) {
const paddle::experimental::Tensor& tensor = tensors[i];
......@@ -132,8 +366,17 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
"size = 0 or same size as tensors"));
// Feed given tensor if it's provided
VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor";
node_input_buffers_dict[grad_node]->add(
input_info.first, input_info.second, grad_tensors[i]);
if (grad_tensors[i].is_initialized()) {
// Deep copy
paddle::experimental::Tensor tmp_tensor;
tmp_tensor.copy_(grad_tensors[i], true);
node_input_buffers_dict[grad_node]->add(input_info.first,
input_info.second, tmp_tensor);
} else {
node_input_buffers_dict[grad_node]->add(
input_info.first, input_info.second, grad_tensors[i]);
}
} else {
VLOG(6) << "Fill grad input tensor " << i << " with 1.0";
......@@ -146,8 +389,9 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
input_info.first, input_info.second, tensor, true /*fill_one=true*/);
}
// Prepare queue
// Prepare queue, potential startup_nodes
queue.push(grad_node);
potential_startup_nodes.emplace(grad_node);
}
VLOG(6) << "Update In degree Map for backward";
......@@ -155,25 +399,74 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
std::unordered_map<GradNodeBase*, int> node_in_degree_map =
getInDegreeMap(queue);
// Get input's GradNodes and InputMeta Info
std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>
input_target_nodes_inputmeta_map;
GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map);
// Purify potential_startup_ops, remove those nodes that are the same as
// input_target_nodes
PurifyPotentialStartUpNodes(&potential_startup_nodes,
&input_target_nodes_inputmeta_map);
// Get Graph Info Betweent input target gradnode and outputs
// Record the depending_nodes and potential_stop_nodes
std::unordered_map<GradNodeBase* /* child node */,
std::unordered_set<GradNodeBase*> /* father node */>
depending_nodes;
std::unordered_set<GradNodeBase*> potential_stop_nodes;
// std::unordered_set<GradNodeBase*> startup_ops;
GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map,
&depending_nodes, &potential_stop_nodes,
&potential_startup_nodes);
// ready_queue store all startup nodes
std::queue<GradNodeBase*> ready_queue;
// startup op's indegree should be 0
for (auto node : potential_startup_nodes) {
if (node_in_degree_map[node] == 0) {
ready_queue.emplace(node);
}
}
VLOG(1) << " startup_ops' size is :" << ready_queue.size();
std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map;
// read_queue is empty only when 1.input equals to output. 2.input can not
// reach to output.
if (ready_queue.size() == 0) {
for (auto input_target_node : input_target_nodes_inputmeta_map) {
// out rank_info of forward op
auto rank_info = input_target_node.second->OutRankInfo();
if (node_input_buffers_dict[input_target_node.first]) {
auto& target_result =
node_input_buffers_dict[input_target_node.first]
->Buffers()[rank_info.first][rank_info.second];
// save the target result
results_map[input_target_node.first] = target_result;
}
}
}
/* --- Topological Visit --- */
// 1. Pop queue
// 2. Run node
// |- Check and capture target result
// |- node(grads)
// |- Prepare for next node
// 3. Update queue
VLOG(6) << "Run Backward";
while (!queue.empty()) {
GradNodeBase* node = queue.front();
while (!ready_queue.empty()) {
GradNodeBase* node = ready_queue.front();
VLOG(6) << "Running GradNode:" << node->name();
ready_queue.pop();
paddle::platform::RecordEvent node_record_event(
std::string(typeid(*node).name()) + " grad_node",
paddle::platform::TracerEventType::Operator, 1);
if (queue.size() > 1 && node_in_degree_map[node] != 0) {
queue.pop();
continue;
}
queue.pop();
// Run node: This is where Hook happens
PADDLE_ENFORCE(
node_input_buffers_dict.count(node),
......@@ -184,10 +477,45 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
std::unique_ptr<GradTensorHolder> node_input_buffer =
std::move(node_input_buffers_dict[node]);
// get target grad_var from node_input_buffer by inputmeta
if (input_target_nodes_inputmeta_map.find(node) !=
input_target_nodes_inputmeta_map.end()) {
VLOG(6) << "Get target result by by inputmeta";
// out rank_info of forward op
auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo();
// rank_info is a pair, first means slot_id, second means rank.
auto& target_result =
node_input_buffer->Buffers()[rank_info.first][rank_info.second];
// save the target result
results_map[node] = target_result;
}
// no_grad_vars
if (no_grad_var_nodes_inputmeta_map.find(node) !=
no_grad_var_nodes_inputmeta_map.end()) {
VLOG(6) << "Change the input buffer[slot][rank] by Zeros";
auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo();
node_input_buffer->SetBufferSlotRankZeros(rank_info.first,
rank_info.second);
}
VLOG(6) << "Running GradNode:" << node->name();
// check input
EnforceGradNodeHasInput(node);
VLOG(6) << "Run Backward Kernel with GradTensorHolder";
// Run Pre Backward Node and get outputs
std::vector<std::vector<paddle::experimental::Tensor>> grad_output_tensors =
(*node)(node_input_buffer->Buffers());
(*node)(node_input_buffer->Buffers(), create_graph);
// retain_grad or not
if (!retain_graph) {
VLOG(6)
<< "retain_graph is false, need to clear the TensorWrapper of nodes.";
node->ClearTensorWrappers();
}
// TODO(jiabin): Should we erase it or find a more efficient way.
node_input_buffers_dict.erase(node);
......@@ -252,18 +580,44 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
// Update queue
node_in_degree_map[next_node]--;
PADDLE_ENFORCE(
node_in_degree_map[next_node] >= 0,
paddle::platform::errors::Fatal(
"Detected in-degree value smaller than zero. For Node: %s"
"Node's in-degree cannot be negative",
next_node->name()));
if (node_in_degree_map[next_node] == 0) {
queue.emplace(std::move(next_node));
bool is_potential_stop_node = potential_stop_nodes.count(next_node);
if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) {
ready_queue.emplace(std::move(next_node));
}
}
}
}
return GetResults(inputs, &results_map, allow_unused, create_graph);
}
// Standard backward pass: propagate `grad_tensors` (or implicit ones) from
// the output `tensors` through the graph, accumulating gradients into leaf
// tensors. Thin wrapper over RunBackward that adds a profiler event.
void Backward(
    const std::vector<paddle::experimental::Tensor>& tensors,  // output
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph) {
  VLOG(6) << "Run in Backward";
  // Profile the whole backward pass as one operator-level trace event.
  paddle::platform::RecordEvent backward_record_event(
      "backward", paddle::platform::TracerEventType::Operator, 1);
  RunBackward(tensors, grad_tensors, retain_graph);
}
// torch.autograd.grad-style API: compute and RETURN the gradients of
// `tensors` (graph outputs) with respect to `inputs`, instead of
// accumulating them into leaves as Backward() does. `no_grad_vars` get
// zeroed gradients; see RunBackward for the remaining flag semantics.
std::vector<paddle::experimental::Tensor> Grad(
    const std::vector<paddle::experimental::Tensor>& tensors,  // output
    const std::vector<paddle::experimental::Tensor>& inputs,
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused,
    const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
  VLOG(6) << "Run in Grad";
  // NOTE(review): `only_inputs` is accepted but never forwarded to
  // RunBackward -- confirm whether it is intentionally unimplemented.
  return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs,
                     allow_unused, no_grad_vars);
}
} // namespace egr
......@@ -19,12 +19,20 @@
namespace egr {
// run_backward():
// Backward():
// tensors corresponds to those lived in the backward graph
// each grad_tensors[i] keeps the value for its corresponding tensors[i]
void RunBackward(const std::vector<paddle::experimental::Tensor> &tensors,
const std::vector<paddle::experimental::Tensor> &grad_tensors,
bool retain_graph = false);
void Backward(const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph = false);
std::vector<paddle::experimental::Tensor> Grad(
const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& inputs,
const std::vector<paddle::experimental::Tensor>& grad_tensors = {},
bool retain_graph = false, bool create_graph = false,
bool only_inputs = false, bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {});
// Reserved for gradient()
......
......@@ -20,8 +20,8 @@
namespace egr {
std::vector<std::vector<paddle::experimental::Tensor>> RunCustomOpNode::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
paddle::CustomOpKernelContext ctx;
auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs(
egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
......
......@@ -37,8 +37,8 @@ class RunCustomOpNode : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) override;
std::string name() {
return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_);
......@@ -62,6 +62,12 @@ class RunCustomOpNode : public GradNodeBase {
return res;
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
void SetAttrs(const std::vector<paddle::any>& attr) { attrs_ = attr; }
public:
......
......@@ -95,8 +95,12 @@ class GradNodeBase {
* is better choice to fit this format.
* **/
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) = 0;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) = 0;
virtual void ClearTensorWrappers() = 0;
virtual bool IsTensorWrappersCleared() = 0;
/**
* AddEdges is designed to set input tensors' backward Node as current
* node's Edges.
......
......@@ -21,6 +21,11 @@
namespace egr {
// Replace the accumulated gradient at buffer_[slot_id][rank] with a zero
// tensor of the same shape/dtype. Used by RunBackward to blank out the
// gradients of tensors listed in no_grad_vars.
void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) {
  buffer_[slot_id][rank] =
      paddle::experimental::zeros_like(buffer_[slot_id][rank]);
}
void GradTensorHolder::add(size_t slot_id, size_t rank,
const paddle::experimental::Tensor& t,
bool fill_one) {
......
......@@ -56,6 +56,8 @@ class GradTensorHolder {
return buffer_;
}
void SetBufferSlotRankZeros(size_t slot_id, size_t rank);
private:
std::vector<std::vector<paddle::experimental::Tensor>> buffer_;
};
......
......@@ -98,6 +98,8 @@ class TensorWrapper {
}
}
void clear() { intermidiate_tensor_.reset(); }
private:
bool full_reserved_ = false;
std::pair<size_t, size_t> out_rank_info_;
......
......@@ -17,6 +17,14 @@
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT);
#endif
namespace eager_test {
using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta;
......@@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) {
CHECK_EQ(dt3_tmp_ptr[1], 10.0f);
t4.reset();
CHECK(t4.defined() == false);
VLOG(6) << "Check Tensor Copy_";
std::vector<int64_t> rows = {1, 2};
std::vector<int64_t> dims = {2};
paddle::experimental::Tensor t7(std::make_shared<phi::SelectedRows>(rows, 2));
std::dynamic_pointer_cast<phi::SelectedRows>(t7.impl())
->mutable_value()
->Resize(phi::make_ddim(dims));
auto* dt7_tmp_ptr = std::dynamic_pointer_cast<phi::SelectedRows>(t7.impl())
->mutable_value()
->mutable_data<float>(paddle::platform::CPUPlace());
dt7_tmp_ptr[0] = 6.0f;
dt7_tmp_ptr[1] = 11.0f;
paddle::experimental::Tensor t8;
paddle::experimental::Tensor t5;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::experimental::Tensor t6;
paddle::experimental::Tensor t9;
VLOG(6) << "Check Tensor Copy_ Selected Rows";
t8.copy_(t7, paddle::platform::CUDAPlace(0), true);
t9.copy_(t8, paddle::platform::CPUPlace(), true);
auto* dt9_tmp_ptr = std::dynamic_pointer_cast<phi::SelectedRows>(t9.impl())
->value()
.data<float>();
CHECK_EQ(dt9_tmp_ptr[0], 6.0f);
CHECK_EQ(dt9_tmp_ptr[1], 11.0f);
CHECK_EQ(std::dynamic_pointer_cast<phi::SelectedRows>(t9.impl())->height(),
2);
VLOG(6) << "Check Tensor Copy_ Dense Tensor";
t5.copy_(t3, paddle::platform::CUDAPlace(0), true);
t6.copy_(t5, paddle::platform::CPUPlace(), true);
auto* dt6_tmp_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(t6.impl())->data<float>();
CHECK_EQ(dt6_tmp_ptr[0], 5.0f);
CHECK_EQ(dt6_tmp_ptr[1], 10.0f);
#else
t5.copy_(t3, paddle::platform::CPUPlace(), true);
auto* dt5_tmp_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(t5.impl())->data<float>();
CHECK_EQ(dt5_tmp_ptr[0], 5.0f);
CHECK_EQ(dt5_tmp_ptr[1], 10.0f);
#endif
VLOG(6) << "Finish";
}
......@@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase {
GradTestNode() : GradNodeBase() { val_ = 1.0; }
std::string name() override { return "GradTestNode"; }
std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override {
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override {
val_ = std::dynamic_pointer_cast<phi::DenseTensor>(grads[0][0].impl())
->data<float>()[0];
phi::DenseTensorMeta meta =
......@@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase {
std::vector<std::vector<paddle::experimental::Tensor>> res = {{et1}};
return res;
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
float val_;
};
} // namespace eager_test
......@@ -58,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 10)
......@@ -80,7 +80,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 2)
......@@ -106,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 2)
......@@ -137,7 +137,7 @@ void benchmark_eager_intermediate_mlp(
reduce_sum_dygraph_function(input0, {{"reduce_all", true}});
std::vector<paddle::experimental::Tensor> target_tensors = {Out};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
if (accuracy_check) {
std::unordered_map<std::string, float> result =
......
......@@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_
cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node)
......
......@@ -33,6 +33,7 @@
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace egr {
......@@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) {
}
std::vector<paddle::experimental::Tensor> outs = {target_tensor};
// Run Backward
RunBackward(outs, {});
Backward(outs, {});
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 5.0);
......@@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) {
}
// Run Backward
RunBackward(target_tensors, grad_tensors);
Backward(target_tensors, grad_tensors);
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 50.0);
......@@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) {
}
// Use Empty Grad Tensor
RunBackward(target_tensors, {});
Backward(target_tensors, {});
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 50.0);
......@@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) {
node2_ptr->AddEdges(&res2, 0);
}
RunBackward(target_tensors, grad_tensors);
Backward(target_tensors, grad_tensors);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 2500.0);
}
......
......@@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) {
std::vector<egr::AutogradMeta*> res = {meta};
scale_node_ptr->AddEdges(&res, 0);
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 5.0);
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 10.0);
......
......@@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) {
std::vector<paddle::experimental::Tensor> outs = {out};
// 4. Run Backward
RunBackward(outs, {});
Backward(outs, {});
VLOG(7) << "Target Grad is: "
<< std::static_pointer_cast<phi::DenseTensor>(
......@@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) {
std::vector<paddle::experimental::Tensor> outs = {out1};
// 4. Run Backward
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 10.0);
......@@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
......@@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
// leaf grad
......@@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
// Cross Batch Accumulation
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 60.0);
......@@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) {
std::vector<paddle::experimental::Tensor> outs = {out};
// 4. Run Backward
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 2.0);
......@@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) {
// TODO(jiabin): fix this with add functor
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
......
......@@ -57,7 +57,7 @@ TEST(Generated, Sigmoid) {
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
VLOG(6) << "Runing Backward";
RunBackward(target_tensors, {});
Backward(target_tensors, {});
VLOG(6) << "Finish Backward";
eager_test::CompareGradTensorWithValue<float>(tensor, 0.25);
......@@ -89,7 +89,7 @@ TEST(Generated, Matmul_v2) {
eager_test::CompareTensorWithValue<float>(output_tensor, 96);
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
eager_test::CompareGradTensorWithValue<float>(Y, 3.0 * 4);
......@@ -120,7 +120,7 @@ TEST(Generated, ElementwiseAdd) {
eager_test::CompareTensorWithValue<float>(output_tensor, 5);
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 1.0);
eager_test::CompareGradTensorWithValue<float>(Y, 1.0);
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace egr {
// Grad() with an empty grad-tensor list: a single GradNodeScale(scale=5.0)
// feeds a GradNodeAccumulation attached to `leaf_tensor`.  Starting from the
// implicit all-ones gradient of `output_tensor`, the expected gradient
// returned for the leaf is 1.0 * 5.0 = 5.0.
TEST(Grad, SingleNodeEmptyGrad) {
  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

  // Prepare Inputs
  paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});

  // Create Target Tensor (output): non-leaf, filled with 1.0
  paddle::experimental::Tensor output_tensor =
      egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
          phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);

  // Create input tensor: the leaf whose gradient Grad() should return
  const paddle::experimental::Tensor leaf_tensor =
      egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
          phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);

  {
    // Create Scale Node (ctor args are (1, 1) — presumably one grad-in and
    // one grad-out slot; confirm against GradNodeBase)
    auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
    node0_ptr->SetAttributes_scale(5.0 /*scale*/);

    // Set grad in/out meta
    node0_ptr->SetDefaultGradInOutMeta();

    // Give output_tensor its GradNode, OutRank and StopGradient properties
    AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor);
    auto_grad_meta->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
    auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta->SetStopGradient(false);

    // Get autograd_meta from input tensor
    AutogradMeta* auto_grad_meta1 =
        EagerUtils::unsafe_autograd_meta(leaf_tensor);

    // Connect Tensor and AccumulationNode via AutoGradMeta
    auto acc_node_ptr =
        std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);

    // Give the input tensor its GradNode, OutRank and StopGradient properties
    auto_grad_meta1->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
    auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta1->SetStopGradient(false);

    // grad_node Add Edges: wire node0 -> accumulation node on slot 0
    std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
    node0_ptr->AddEdges(&res, 0);
  }
  std::vector<paddle::experimental::Tensor> outs = {output_tensor};

  // Run Grad
  auto result = Grad(outs, {leaf_tensor}, {});

  // Check Output Value: d(out)/d(leaf) = scale = 5.0
  eager_test::CompareTensorWithValue<float>(result[0], 5.0);
}
// Grad() with an explicit (custom) grad tensor: the backward pass is seeded
// with 10.0 instead of the implicit ones, and flows through a single
// GradNodeScale(scale=5.0), so the expected leaf gradient is 10.0 * 5.0 = 50.0.
TEST(Grad, SingleNodeCustomGrad) {
  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

  // Prepare Inputs
  std::vector<paddle::experimental::Tensor> target_tensors;
  paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});

  // Create Target Tensor (non-leaf output of the graph)
  paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
      ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
      phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
  target_tensors.emplace_back(std::move(tensor));

  std::vector<paddle::experimental::Tensor> grad_tensors;
  // Create Grad Tensor: custom backward seed filled with 10.0
  paddle::experimental::Tensor grad_tensor =
      egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
          phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
  grad_tensors.emplace_back(std::move(grad_tensor));

  // Leaf tensor whose gradient Grad() should return
  paddle::experimental::Tensor leaf_tensor =
      egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
          phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);

  {
    // Create Scale Node
    auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
    node0_ptr->SetAttributes_scale(5.0 /*scale*/);

    // Set grad in/out meta
    node0_ptr->SetDefaultGradInOutMeta();

    // Connect Tensor and Node via AutoGradMeta
    AutogradMeta* auto_grad_meta =
        EagerUtils::autograd_meta(&(target_tensors[0]));
    auto_grad_meta->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
    auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta->SetStopGradient(false);

    AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);

    // Connect Tensor and AccumulationNode via AutoGradMeta
    auto acc_node_ptr =
        std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);

    auto_grad_meta1->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
    auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta1->SetStopGradient(false);

    // Wire node0 -> accumulation node on slot 0
    std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
    node0_ptr->AddEdges(&res, 0);
  }

  auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors);

  // Check Output Value: 10.0 (seed) * 5.0 (scale) = 50.0
  eager_test::CompareTensorWithValue<float>(result[0], 50.0);
}
/*
Node1
|
Node0
|
{ } // empty grad tensor
*/
// Grad() through a chain of two nodes (see diagram above): the implicit
// all-ones seed passes through Node0 (scale=5.0) and then Node1 (scale=10.0),
// so the expected leaf gradient is 1.0 * 5.0 * 10.0 = 50.0.
TEST(Grad, LinearNodes) {
  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

  // Prepare Target Tensor
  std::vector<paddle::experimental::Tensor> target_tensors;
  paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});

  // Create Target Tensor (non-leaf output)
  paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
      ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
      phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
  target_tensors.emplace_back(std::move(tensor));

  // Leaf tensor whose gradient Grad() should return
  paddle::experimental::Tensor leaf_tensor =
      egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
          phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);

  {
    // Create Node0 (scale = 5.0)
    auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
    node0_ptr->SetAttributes_scale(5.0 /*scale*/);

    // Set grad in/out meta for node0
    node0_ptr->SetDefaultGradInOutMeta();

    // Create Node1 (scale = 10.0)
    auto node1_ptr = std::make_shared<GradNodeScale>(1, 1);
    node1_ptr->SetAttributes_scale(10.0 /*scale*/);

    // Set grad in/out meta for node1
    node1_ptr->SetDefaultGradInOutMeta();

    // Connect Input Tensor and Node0 via AutoGradMeta
    AutogradMeta* auto_grad_meta =
        EagerUtils::autograd_meta(&(target_tensors[0]));
    auto_grad_meta->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
    auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta->SetStopGradient(false);

    // Connect Node0 -> Node1 via Edge (meta0 is a throwaway AutogradMeta
    // used only to describe the edge)
    auto meta0 = egr::AutogradMeta();
    meta0.SetStopGradient(false);
    meta0.SetSingleOutRankWithSlot(0, 0);
    meta0.SetGradNode(node1_ptr);
    std::vector<egr::AutogradMeta*> res0 = {&meta0};
    node0_ptr->AddEdges(&res0, 0);

    AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);

    // Connect Tensor and AccumulationNode via AutoGradMeta
    auto acc_node_ptr =
        std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);

    auto_grad_meta1->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
    auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta1->SetStopGradient(false);

    // Wire Node1 -> accumulation node on slot 0
    std::vector<egr::AutogradMeta*> res1 = {auto_grad_meta1};
    node1_ptr->AddEdges(&res1, 0);
  }

  // Use Empty Grad Tensor (implicit all-ones seed)
  auto result = Grad(target_tensors, {leaf_tensor}, {});

  // Check Output Value: 1.0 * 5.0 * 10.0 = 50.0
  eager_test::CompareTensorWithValue<float>(result[0], 50.0);
}
/*
Node2
| |
Node0 Node1
| |
in0 in1
*/
// Grad() with two roots converging on one node (see diagram above): custom
// seeds 5.0 and 10.0 flow through Node0 (scale=5.0) and Node1 (scale=10.0)
// respectively, both feed Node2 (scale=20.0), where they accumulate:
// (5.0*5.0 + 10.0*10.0) * 20.0 = 2500.0 expected at the leaf.
TEST(Grad, WithAccumulation) {
  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

  // Prepare Inputs
  paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});

  // Create Target Tensors: two non-leaf roots for the backward pass
  std::vector<paddle::experimental::Tensor> target_tensors;
  paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue(
      ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
      phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
  paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue(
      ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
      phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
  target_tensors.emplace_back(std::move(tensor0));
  target_tensors.emplace_back(std::move(tensor1));

  // Create Grad Tensors: custom seeds 5.0 (for root 0) and 10.0 (for root 1)
  std::vector<paddle::experimental::Tensor> grad_tensors;
  paddle::experimental::Tensor grad_tensor0 =
      egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
          phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
  paddle::experimental::Tensor grad_tensor1 =
      egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
          phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
  grad_tensors.emplace_back(std::move(grad_tensor0));
  grad_tensors.emplace_back(std::move(grad_tensor1));

  // Leaf tensor whose gradient Grad() should return (default-constructed;
  // its AutogradMeta is created below)
  paddle::experimental::Tensor leaf_tensor;
  {
    // Create Node0 (scale = 5.0)
    auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
    node0_ptr->SetAttributes_scale(5.0 /*scale*/);
    node0_ptr->SetDefaultGradInOutMeta();

    // Create Node1 (scale = 10.0)
    auto node1_ptr = std::make_shared<GradNodeScale>(1, 1);
    node1_ptr->SetAttributes_scale(10.0 /*scale*/);
    node1_ptr->SetDefaultGradInOutMeta();

    // Create Node2 (scale = 20.0) — the common downstream node
    auto node2_ptr = std::make_shared<GradNodeScale>(1, 1);
    node2_ptr->SetAttributes_scale(20.0 /*scale*/);
    node2_ptr->SetDefaultGradInOutMeta();

    // Connect Inp0 and Node0 via AutoGradMeta
    AutogradMeta* auto_grad_meta0 =
        EagerUtils::autograd_meta(&(target_tensors[0]));
    auto_grad_meta0->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
    auto_grad_meta0->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta0->SetStopGradient(false);

    // Connect Inp1 and Node1 via AutoGradMeta
    AutogradMeta* auto_grad_meta1 =
        EagerUtils::autograd_meta(&(target_tensors[1]));
    auto_grad_meta1->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(node1_ptr));
    auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta1->SetStopGradient(false);

    // Connect Node0 -> Node2 via Edge (meta0 only describes the edge)
    auto meta0 = egr::AutogradMeta();
    meta0.SetStopGradient(false);
    meta0.SetSingleOutRankWithSlot(0, 0);
    meta0.SetGradNode(node2_ptr);
    std::vector<egr::AutogradMeta*> res0 = {&meta0};
    node0_ptr->AddEdges(&res0, 0);

    // Connect Node1 -> Node2 via Edge
    auto meta1 = egr::AutogradMeta();
    meta1.SetStopGradient(false);
    meta1.SetSingleOutRankWithSlot(0, 0);
    meta1.SetGradNode(node2_ptr);
    std::vector<egr::AutogradMeta*> res1 = {&meta1};
    node1_ptr->AddEdges(&res1, 0);

    AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor);

    // Connect Tensor and AccumulationNode via AutoGradMeta
    auto acc_node_ptr =
        std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta2);

    auto_grad_meta2->SetGradNode(
        std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
    auto_grad_meta2->SetSingleOutRankWithSlot(0, 0);
    auto_grad_meta2->SetStopGradient(false);

    // Wire Node2 -> accumulation node on slot 0
    std::vector<egr::AutogradMeta*> res2 = {auto_grad_meta2};
    node2_ptr->AddEdges(&res2, 0);
  }

  auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors);

  // (5*5 + 10*10) * 20 = 2500
  eager_test::CompareTensorWithValue<float>(result[0], 2500.0);
}
} // namespace egr
......@@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) {
leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0
}
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 4.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0);
......@@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) {
leaf_tensor, std::make_shared<egr::CppTensorHook>(hook_function));
}
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0);
}
......
......@@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) {
}
VLOG(6) << "Runing Backward";
RunBackward(target_tensors, {});
Backward(target_tensors, {});
VLOG(6) << "Finish Backward";
eager_test::CompareGradTensorWithValue<float>(
......@@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) {
grad_node_tmp->RemoveGradientHook(hook_id);
}
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 1.0);
eager_test::CompareGradTensorWithValue<float>(
......@@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) {
grad_node_tmp->RemoveGradientHook(hook_id);
}
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
eager_test::CompareGradTensorWithValue<float>(
......
......@@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase {
~GradNodeRunProgram() override = default;
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>> &grads)
override {
const std::vector<std::vector<paddle::experimental::Tensor>> &grads,
bool create_graph) override {
VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
PADDLE_ENFORCE_EQ(
grads.size(), 1,
......@@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase {
// return {x_grad, details::DereferenceTensors(params_grad_ptr)};
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
// SetAttrMap
void SetAttrMap(const paddle::framework::AttributeMap &attrs) {
attrs_ = attrs;
......
......@@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference)
pass_library(add_support_int8_pass inference)
pass_library(matmul_scale_fuse_pass inference)
pass_library(gpu_cpu_map_matmul_to_mul_pass inference)
pass_library(mixed_precision_configure_pass inference)
pass_library(generate_pass DEPS pass_desc_proto)
target_link_libraries(generate_pass pass_desc_proto)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace ir {
// Surrounds every blacklisted (FP16-disabled) op with cast ops so it can keep
// running in FP32 inside an otherwise-FP16 graph:
//  - each non-persistable float input coming from a non-blacklisted, non-cast
//    producer gets a cast (in_dtype=4, out_dtype=5) inserted in front;
//  - each non-persistable float output going to a non-blacklisted, non-cast
//    consumer gets a cast (in_dtype=5, out_dtype=4) inserted behind.
// The dtype codes 4/5 follow proto::VarType (4 == FP16, 5 == FP32), per the
// inline comments below.
void MixedPrecisionConfigurePass::InsertCastOps(
    Graph* graph, const StringSet& blacklist) const {
  VLOG(3) << "Insert the cast op before and after the kernel that does not "
             "supports fp16 precision";

  // Fills `desc` so it describes cast(x_name) -> out_name with the given
  // in/out dtypes.
  auto update_cast_desc = [&](
      framework::OpDesc& desc, const std::string& x_name,
      const std::string& out_name, const int in_dtype, const int out_dtype) {
    desc.SetType("cast");
    desc.SetInput("X", {x_name});
    desc.SetOutput("Out", {out_name});
    desc.SetAttr("in_dtype", in_dtype);
    desc.SetAttr("out_dtype", out_dtype);
    desc.SetAttr("use_mkldnn", false);
    desc.SetAttr("with_quant_attr", false);
    desc.Flush();
  };

  // Inserts FP16->FP32 casts on the inputs of `op_node` (an op in
  // `cast_list`), one per eligible producer op of each input var.
  auto cast_input = [&](Graph* graph, Node* op_node,
                        const StringSet& cast_list) {
    auto inlinks = op_node->inputs;
    for (auto* pre_node : inlinks) {
      if (pre_node->IsVar()) {
        const auto is_persistable = pre_node->Var()->Persistable();
        // Only float vars need a precision cast.
        const auto is_float =
            pre_node->Var()->GetDataType() == proto::VarType::FP16 ||
            pre_node->Var()->GetDataType() == proto::VarType::FP32 ||
            pre_node->Var()->GetDataType() == proto::VarType::FP64;
        if (!is_persistable && is_float) {
          int suffix = 0;  // disambiguates names when a var has several producers
          for (auto* pre_node_input : pre_node->inputs) {
            if (!pre_node_input->IsOp()) continue;
            const auto& type = pre_node_input->Op()->Type();
            // Skip producers that are blacklisted themselves (they already
            // emit FP32) or are existing casts.
            if (!cast_list.count(type) && type != "cast") {
              std::string old_name = pre_node->Name();
              std::string new_name =
                  old_name + "_cast.tmp_" + std::to_string(suffix);
              suffix++;
              framework::OpDesc new_op_desc(op_node->Op()->Block());
              // 4 for fp16, 5 for fp32
              update_cast_desc(new_op_desc, old_name, new_name, 4, 5);
              auto* new_op = graph->CreateOpNode(&new_op_desc);

              VarDesc out_var(new_name);
              out_var.SetPersistable(false);
              auto* node_var = graph->CreateVarNode(&out_var);

              // Reroute: pre_node -> cast -> new var -> op_node.
              op_node->Op()->RenameInput(old_name, new_name);
              IR_NODE_LINK_TO(pre_node, new_op);
              IR_NODE_LINK_TO(new_op, node_var);
              IR_NODE_LINK_TO(node_var, op_node);
            }
          }
        }
      }
    }
  };

  // Mirror of cast_input: inserts FP32->FP16 casts between `op_node`'s
  // outputs and each eligible consumer op.
  auto cast_output = [&](Graph* graph, Node* op_node,
                         const StringSet& cast_list) {
    auto outlinks = op_node->outputs;
    for (auto* next_node : outlinks) {
      if (next_node->IsVar()) {
        const auto is_persistable = next_node->Var()->Persistable();
        const auto is_float =
            next_node->Var()->GetDataType() == proto::VarType::FP16 ||
            next_node->Var()->GetDataType() == proto::VarType::FP32 ||
            next_node->Var()->GetDataType() == proto::VarType::FP64;
        if (!is_persistable && is_float) {
          int suffix = 0;  // disambiguates names when a var has several consumers
          for (auto* next_node_output : next_node->outputs) {
            if (!next_node_output->IsOp()) continue;
            const auto& type = next_node_output->Op()->Type();
            if (!cast_list.count(type) && type != "cast") {
              std::string old_name = next_node->Name();
              std::string new_name =
                  old_name + "_cast.tmp_" + std::to_string(suffix);
              suffix++;
              framework::OpDesc new_op_desc(op_node->Op()->Block());
              // 4 for fp16, 5 for fp32
              update_cast_desc(new_op_desc, old_name, new_name, 5, 4);
              auto* new_op = graph->CreateOpNode(&new_op_desc);

              VarDesc out_var(new_name);
              out_var.SetPersistable(false);
              auto* node_var = graph->CreateVarNode(&out_var);

              // Reroute: next_node -> cast -> new var -> consumer, and point
              // the consumer's input at the cast result.
              next_node_output->Op()->RenameInput(old_name, new_name);
              IR_NODE_LINK_TO(next_node, new_op);
              IR_NODE_LINK_TO(new_op, node_var);
              IR_NODE_LINK_TO(node_var, next_node_output);
            }
          }
        }
      }
    }
  };

  // Walk ops in topological order; feed/fetch are boundary ops and never cast.
  for (auto* op_node :
       ir::TopologyVarientSort(*graph, static_cast<ir::SortKind>(0))) {
    if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
        op_node->Op()->Type() == "fetch")
      continue;

    const auto& type = op_node->Op()->Type();
    if (blacklist.count(type)) {
      cast_input(graph, op_node, blacklist);
      cast_output(graph, op_node, blacklist);
    }
  }
}
// Pass entry point: read the set of op types the user disabled for GPU FP16
// and let InsertCastOps wrap each of them with the required precision casts.
void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const {
  const auto& disabled_op_types =
      Get<std::unordered_set<std::string>>("gpu_fp16_disabled_op_types");
  InsertCastOps(graph, disabled_op_types);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mixed_precision_configure_pass,
paddle::framework::ir::MixedPrecisionConfigurePass);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
using StringSet = std::unordered_set<std::string>;
// Inference IR pass: when GPU FP16 execution is enabled, keeps the op types
// listed in the pass attribute `gpu_fp16_disabled_op_types` running in FP32
// by inserting cast ops around them (see InsertCastOps in the .cc file).
class MixedPrecisionConfigurePass : public FusePassBase {
 public:
  MixedPrecisionConfigurePass() = default;
  virtual ~MixedPrecisionConfigurePass() {}

 protected:
  // Reads the blacklist attribute and rewrites `graph` accordingly.
  void ApplyImpl(Graph* graph) const override;

 private:
  // Inserts fp16<->fp32 cast ops before/after every op whose type is in
  // `blacklist`.
  void InsertCastOps(Graph* graph, const StringSet& blacklist) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -188,6 +188,9 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool);
DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes,
std::unordered_set<std::string>);
// Usually use for trt dynamic shape.
// TRT will select the best kernel according to opt shape
......
......@@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument,
new int(argument->dlnne_min_subgraph_size()));
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
} else if (pass_name == "mixed_precision_configure_pass") {
pass->Set("gpu_fp16_disabled_op_types",
new std::unordered_set<std::string>(
argument->gpu_fp16_disabled_op_types()));
}
if (pass_name == "lite_subgraph_pass") {
bool lite_enable_int8 =
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
#else
void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap(
const framework::ir::Graph &graph,
std::unordered_map<std::string, std::string> *var_name_op_type_map) {
std::vector<framework::ir::Node *> node_list =
framework::ir::TopologyVarientSort(
graph, static_cast<framework::ir::SortKind>(0));
for (auto *op_node : node_list) {
if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
op_node->Op()->Type() == "fetch")
continue;
for (auto *pre_node : op_node->inputs) {
if (pre_node->IsVar() && pre_node->Var()->Persistable()) {
var_name_op_type_map->insert(std::pair<std::string, std::string>(
pre_node->Var()->Name(), op_node->Op()->Type()));
}
}
}
}
void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
......@@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
if (with_dynamic_shape) {
reserve_cpu_weights = true;
}
bool mixed_precision_mode =
argument->Has("use_gpu_fp16") && argument->use_gpu_fp16();
std::unordered_map<std::string, std::string> var_name_op_type_map{};
std::unordered_set<std::string> blacklist{};
if (mixed_precision_mode) {
GetVarNameToOpTypeMap(graph, &var_name_op_type_map);
blacklist = argument->gpu_fp16_disabled_op_types();
}
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
......@@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
var->IsType<framework::Tensor>()) {
auto *t = var->GetMutable<framework::LoDTensor>();
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(t->dims());
temp_tensor.mutable_data<float>(cpu_place);
// Copy the parameter data to a tmp tensor.
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocation the space on GPU
t->clear();
// Copy parameter data to newly allocated GPU space.
paddle::framework::TensorCopySync(temp_tensor, place, t);
bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 ||
t->dtype() == paddle::experimental::DataType::FLOAT64;
if (mixed_precision_mode &&
!blacklist.count(var_name_op_type_map[var_name]) && is_float) {
framework::Tensor half_tensor;
half_tensor.set_type(paddle::experimental::DataType::FLOAT16);
half_tensor.Resize(t->dims());
auto *half_data =
half_tensor.mutable_data<float16>(platform::CPUPlace());
for (int i = 0; i < t->numel(); i++) {
auto *data = t->mutable_data<float>(platform::CPUPlace());
half_data[i] = static_cast<float16>(data[i]);
}
t->clear();
paddle::framework::TensorCopySync(half_tensor, place, t);
} else {
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(t->dims());
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
t->clear();
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
}
}
}
......
......@@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
#ifdef PADDLE_WITH_ASCEND_CL
void CopyParamsToNpu(Argument *argument);
#else
void CopyParamsToGpu(Argument *argument);
void GetVarNameToOpTypeMap(
const framework::ir::Graph& graph,
std::unordered_map<std::string, std::string>* var_name_op_type_map);
void CopyParamsToGpu(Argument* argument);
#endif
};
......
......@@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path,
Update();
}
void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
Update();
}
// Turn off GPU inference: clear use_gpu_ and re-run Update() so dependent
// options and the pass strategy are rebuilt for CPU-only execution.
void AnalysisConfig::DisableGpu() {
  use_gpu_ = false;
  Update();
}
// Experimental: enable GPU FP16 (half precision) inference.
//
// \param op_list additional operator types that must stay in FP32; they are
//   merged into gpu_fp16_disabled_op_types_, which already contains a
//   built-in blacklist (see the member's default initializer).
// On builds without CUDA/HIP this only logs an error and leaves FP16 off.
void AnalysisConfig::Exp_EnableUseGpuFp16(
    std::unordered_set<std::string> op_list) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  use_gpu_fp16_ = true;
  gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end());
#else
  LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()";
  use_gpu_fp16_ = false;
#endif
  // Rebuild dependent options / pass strategy with the new flag.
  Update();
}
void AnalysisConfig::DisableFCPadding() {
use_fc_padding_ = false;
......@@ -213,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_cudnn_);
CP_MEMBER(gpu_device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(use_gpu_fp16_);
CP_MEMBER(gpu_fp16_disabled_op_types_);
CP_MEMBER(enable_memory_optim_);
// TensorRT related.
......@@ -573,6 +590,20 @@ void AnalysisConfig::Update() {
#endif
}
if (use_gpu_fp16_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!enable_ir_optim_) {
LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is "
"enabled.";
} else if (!use_gpu()) {
LOG(ERROR)
<< "Exp_EnableUseGpuFp16() only works when use_gpu is enabled.";
} else {
pass_builder()->Exp_EnableUseGpuFp16();
}
#endif
}
if (use_mkldnn_) {
#ifdef PADDLE_WITH_MKLDNN
if (!enable_ir_optim_) {
......@@ -669,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << params_file_;
ss << use_gpu_;
ss << use_gpu_fp16_;
for (auto &item : gpu_fp16_disabled_op_types_) ss << item;
ss << use_fc_padding_;
ss << gpu_device_id_;
ss << xpu_device_id_;
......
......@@ -872,6 +872,11 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
}
if (config_.gpu_fp16_enabled()) {
argument_.SetUseGPUFp16(true);
argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_);
}
if (config_.lite_engine_enabled()) {
argument_.SetCpuMathLibraryNumThreads(
config_.cpu_math_library_num_threads());
......
......@@ -375,6 +375,19 @@ TEST(AnalysisPredictor, enable_onnxruntime) {
ASSERT_TRUE(!config.use_onnxruntime());
}
// Checks that Exp_EnableUseGpuFp16() turns gpu_fp16_enabled() on for GPU
// builds; on CPU-only builds it just exercises DisableGpu() and Summary().
TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) {
  AnalysisConfig config;
  config.SwitchIrOptim();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  config.EnableUseGpu(100, 0);
  config.Exp_EnableUseGpuFp16();
  ASSERT_TRUE(config.gpu_fp16_enabled());
#else
  config.DisableGpu();
#endif
  LOG(INFO) << config.Summary();
}
} // namespace paddle
namespace paddle_infer {
......@@ -434,6 +447,19 @@ TEST(Predictor, EnableONNXRuntime) {
auto predictor = CreatePredictor(config);
}
// Smoke test: create a predictor with the experimental GPU FP16 pipeline
// enabled (plain CPU on non-GPU builds) and verify construction succeeds.
TEST(Predictor, Exp_EnableUseGpuFp16) {
  Config config;
  config.SetModel(FLAGS_dirname);
  config.SwitchIrOptim();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  config.EnableUseGpu(100, 0);
  config.Exp_EnableUseGpuFp16();
#else
  config.DisableGpu();
#endif
  auto predictor = CreatePredictor(config);
}
TEST(Tensor, CpuShareExternalData) {
Config config;
config.SetModel(FLAGS_dirname);
......
......@@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig {
///
///
void DisableGpu();
///
/// \brief Enable GPU fp16 precision computation, in experimental state.
///
/// \param op_list The operator type list.
///
void Exp_EnableUseGpuFp16(std::unordered_set<std::string> op_list = {});
///
/// \brief A boolean state telling whether the GPU fp16 precision is turned
/// on.
///
/// \return bool Whether the GPU fp16 precision is turned on.
///
bool gpu_fp16_enabled() const { return use_gpu_fp16_; }
///
/// \brief Turn on XPU.
......@@ -859,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig {
int gpu_device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
bool thread_local_stream_{false};
bool use_gpu_fp16_{false};
std::unordered_set<std::string> gpu_fp16_disabled_op_types_{
"conv2d_fusion", "conv2d", "roll", "strided_slice"};
bool use_cudnn_{false};
......
......@@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() {
use_cudnn_ = true;
}
// Switch this GPU pass strategy to the experimental FP16 pipeline: the whole
// pass list is REPLACED by an FP16-compatible subset plus
// mixed_precision_configure_pass, and use_gpu_fp16_ is set so that
// use_gpu_fp16() reports the mode.  Note fc_fuse_pass is deliberately
// commented out of the list.
void GpuPassStrategy::Exp_EnableUseGpuFp16() {
  passes_.assign({
    "is_test_pass",                               //
    "simplify_with_basic_ops_pass",               //
    "conv_bn_fuse_pass",                          //
    "conv_eltwiseadd_bn_fuse_pass",               //
    "embedding_eltwise_layernorm_fuse_pass",      //
    "multihead_matmul_fuse_pass_v2",              //
    "gpu_cpu_squeeze2_matmul_fuse_pass",          //
    "gpu_cpu_reshape2_matmul_fuse_pass",          //
    "gpu_cpu_flatten2_matmul_fuse_pass",          //
    "gpu_cpu_map_matmul_v2_to_mul_pass",          //
    "gpu_cpu_map_matmul_v2_to_matmul_pass",       //
    "gpu_cpu_map_matmul_to_mul_pass",             //
    // "fc_fuse_pass",                            //
    "fc_elementwise_layernorm_fuse_pass",         //
#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
                           // guaranteed at least v7
// cudnn8.0 has memory leak problem in conv + eltwise + act, so we
// disable the pass.
#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100)
    "conv_elementwise_add_act_fuse_pass",   //
    "conv_elementwise_add2_act_fuse_pass",  //
#endif
    "conv_elementwise_add_fuse_pass",  //
#endif                                 //
    "transpose_flatten_concat_fuse_pass",  //
    "mixed_precision_configure_pass",      //
    "runtime_context_cache_pass"           //
  });
  use_gpu_fp16_ = true;
}
// MKLDNN is a CPU-only library; on the GPU strategy this is an unsupported
// no-op that merely logs an error.
void GpuPassStrategy::EnableMKLDNN() {
  LOG(ERROR) << "GPU not support MKLDNN yet";
}
......
......@@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \brief Enable the use of cuDNN kernel.
virtual void EnableCUDNN() {}
/// \brief Enable use gpu fp16 kernel.
virtual void Exp_EnableUseGpuFp16() {}
/// \brief Enable the use of MKLDNN.
/// The MKLDNN control exists in both CPU and GPU mode, because there can
/// still be some CPU kernels running in GPU mode.
......@@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; }
/// \brief Check if we are using gpu fp16 kernel.
/// \return A bool variable implying whether we are in gpu fp16 mode.
bool use_gpu_fp16() const { return use_gpu_fp16_; }
/// \brief Check if we are using xpu.
/// \return A bool variable implying whether we are in xpu mode.
bool use_xpu() const { return use_xpu_; }
......@@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
bool use_npu_{false};
bool use_ipu_{false};
bool use_mkldnn_{false};
bool use_gpu_fp16_{false};
/// \endcond
};
......@@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \brief Enable the use of cuDNN kernel.
void EnableCUDNN() override;
/// \brief Enable the use of gpu fp16 kernel.
void Exp_EnableUseGpuFp16() override;
/// \brief Not supported in GPU mode yet.
void EnableMKLDNN() override;
......@@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
protected:
/// \cond Protected
bool use_cudnn_{false};
bool use_gpu_fp16_{false};
/// \endcond
};
......
......@@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
if (gridDim.x > 1) {
shared_max_data[tid] = T(0);
T local_max_data = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T tmp = abs(in[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
} else {
if (bid < n) {
shared_max_data[tid] = abs(in[bid]);
......@@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
extern __shared__ T shared_max_data[];
shared_max_data[tid] = T(0);
T local_max_data = T(0);
for (int i = tid; i < channel_size; i += blockDim.x) {
T tmp = fabs(in_c[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
......@@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
int tid = threadIdx.x;
int bid = blockIdx.x;
const T* in_current = in + tid * cout_wh_size + bid * wh_size;
shared_max_data[tid] = T(0);
T local_max_data = T(0);
for (int i = 0; i < wh_size; i++) {
T tmp = fabs(in_current[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
int len = blockDim.x;
......@@ -404,6 +407,19 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
}
};
// Device-side update of the moving-average abs-max quantization statistics:
//   state = rate * in_state + 1
//   accum = rate * in_accum + cur_scale
//   scale = accum / state
// All pointers refer to single-element buffers; doing the update in a kernel
// avoids a GPU->CPU->GPU round trip.  NOTE(review): every launched thread
// writes the same scalars, so this kernel is intended for a <<<1, 1>>>
// launch (which is how FindMovingAverageAbsMaxFunctor below invokes it).
template <typename T>
__global__ void FindMovingAverageAbsMaxKernel(const T* in_state,
                                              const T* in_accum,
                                              const T* cur_scale, const T rate,
                                              T* out_state, T* out_accum,
                                              T* out_scale) {
  T state = rate * (*in_state) + T(1.0f);
  T accum = rate * (*in_accum) + (*cur_scale);
  *out_state = state;
  *out_accum = accum;
  *out_scale = accum / state;
}
template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
......@@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
framework::Tensor* out_accum, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T accum;
T state;
T scale;
memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
sizeof(T), ctx.stream());
memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
sizeof(T), ctx.stream());
memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
ctx.stream());
ctx.Wait();
T rate_t = static_cast<T>(rate);
state = rate_t * state + static_cast<T>(1.0);
accum = rate_t * accum + scale;
scale = accum / state;
memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
platform::CPUPlace(), &accum, sizeof(T), ctx.stream());
memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
platform::CPUPlace(), &state, sizeof(T), ctx.stream());
memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
platform::CPUPlace(), &scale, sizeof(T), ctx.stream());
ctx.Wait();
T* out_state_data = out_state->mutable_data<T>(gpu_place);
T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
out_state_data, out_accum_data, out_scale_data);
}
};
......
......@@ -15,9 +15,13 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -27,43 +31,6 @@ using Tensor = framework::Tensor;
class GridSampleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler");
OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler");
OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler");
auto x_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid");
PADDLE_ENFORCE_EQ(x_dims.size(), 4,
platform::errors::InvalidArgument(
"Input(X) of GridSampleOp should be 4-D Tensor, but "
"received X dimension size(%d)",
x_dims.size()));
PADDLE_ENFORCE_EQ(grid_dims.size(), 4,
platform::errors::InvalidArgument(
"Input(Grid) of GridSampleOp should be 4-D Tensor, "
"but received X dimension size(%d)",
grid_dims.size()));
if (ctx->IsRuntime() || grid_dims[3] > 0) {
PADDLE_ENFORCE_EQ(
grid_dims[3], 2,
platform::errors::InvalidArgument(
"Input(Grid) dimension[3] should be 2, but received %d",
grid_dims[3]));
}
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
grid_dims[0], x_dims[0],
platform::errors::InvalidArgument(
"Input(X) and Input(Grid) dimension[0] should be equal, but "
"received X dimension[0](%d) != Grid dimension[0](%d)",
x_dims[0], grid_dims[0]));
}
ctx->SetOutputDim("Output",
{x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
ctx->ShareLoD("X", "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
class GridSampleOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
framework::GradVarName("X"), "grid_sampler");
auto input_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
}
if (ctx->HasOutput(framework::GradVarName("Grid"))) {
ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -224,10 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor,
PD_INFER_META(phi::GridSampleBaseInferMeta));
REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
ops::GridSampleGradMaker<paddle::framework::OpDesc>,
ops::GridSampleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
ops::GridSampleGradMaker<paddle::imperative::OpBase>,
GridSamplerInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor,
PD_INFER_META(phi::GeneralBinaryGradInferMeta));
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad,
GridSamplerGradInferShapeFunctor);
REGISTER_OP_VERSION(grid_sampler)
.AddCheckpoint(
......
......@@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args,
EAGER_TRY
auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
egr::RunBackward(tensors, grad_tensors,
CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2));
egr::Backward(tensors, grad_tensors,
CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2));
Py_INCREF(Py_None);
return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL
}
// Python-exposed entry point for eager-mode partial grad ("run_partial_grad").
// Positional args (0..7): outputs, inputs, grad_outputs, retain_graph,
// create_graph, only_inputs, allow_unused, no_grad_vars.  Forwards everything
// to egr::Grad and returns the resulting gradients as a Python list, with
// Py_None substituted for any gradient tensor that egr::Grad left
// uninitialized (see the ToPyObject flag below).
static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args,
                                            PyObject* kwargs) {
  EAGER_TRY
  auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
  auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
  auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2);
  auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
  auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4);
  auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5);
  auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6);
  auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7);
  std::vector<paddle::experimental::Tensor> result =
      egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph,
                only_inputs, allow_unused, no_grad_vars);
  VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad";
  return ToPyObject(result, true /* return_py_none_if_not_initialize */);
  EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
......@@ -452,6 +472,9 @@ PyMethodDef variable_functions[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward,
METH_VARARGS | METH_KEYWORDS, NULL},
{"run_partial_grad",
(PyCFunction)(void (*)(void))eager_api_run_partial_grad,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op,
METH_VARARGS | METH_KEYWORDS, NULL},
{"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy,
......
......@@ -226,6 +226,19 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
// Implements Tensor.cpu(): returns a copy of this tensor placed on the CPU
// backend.  The copy never tracks gradients (stop_gradient is forced true)
// but inherits the source tensor's persistable flag.
// NOTE(review): the `true` passed to copy_to presumably requests a blocking
// (synchronous) copy -- confirm against the copy_to signature.
static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args,
                                   PyObject* kwargs) {
  EAGER_TRY
  auto cp_tensor =
      self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true);
  egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
  egr::EagerUtils::autograd_meta(&cp_tensor)
      ->SetPersistable(
          egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
  return ToPyObject(cp_tensor);
  EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_reconstruct_from_(TensorObject* self,
PyObject* args,
PyObject* kwargs) {
......@@ -264,7 +277,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args,
egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable());
}
self->tensor.copy_(src_tensor, blocking);
self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking);
VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to "
<< self->tensor.name();
......
......@@ -96,7 +96,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value,
"Detected NULL grad"
"Please check if you have manually cleared"
"the grad inside autograd_meta"));
grad->copy_(src, true);
grad->copy_(src, self->tensor.inner_place(), true);
return 0;
EAGER_CATCH_AND_THROW_RETURN_ZERO
}
......
......@@ -492,20 +492,26 @@ PyObject* ToPyObject(const std::vector<double>& value) {
return result;
}
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value) {
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
bool return_py_none_if_not_initialize) {
PyObject* result = PyList_New((Py_ssize_t)value.size());
for (size_t i = 0; i < value.size(); i++) {
PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
if (obj) {
auto v = reinterpret_cast<TensorObject*>(obj);
new (&(v->tensor)) paddle::experimental::Tensor();
v->tensor = value[i];
if (!value[i].initialized() && return_py_none_if_not_initialize) {
Py_INCREF(Py_None);
PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), Py_None);
} else {
PADDLE_THROW(platform::errors::Fatal(
"tp_alloc return null, can not new a PyObject."));
PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
if (obj) {
auto v = reinterpret_cast<TensorObject*>(obj);
new (&(v->tensor)) paddle::experimental::Tensor();
v->tensor = value[i];
} else {
PADDLE_THROW(platform::errors::Fatal(
"tp_alloc return null, can not new a PyObject."));
}
PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), obj);
}
PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), obj);
}
return result;
......
......@@ -68,7 +68,8 @@ PyObject* ToPyObject(const std::vector<int>& value);
PyObject* ToPyObject(const std::vector<int64_t>& value);
PyObject* ToPyObject(const std::vector<float>& value);
PyObject* ToPyObject(const std::vector<double>& value);
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value);
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
bool return_py_none_if_not_initialize = false);
PyObject* ToPyObject(const platform::Place& value);
PyObject* ToPyObject(const framework::LoDTensor* value);
PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype);
......
......@@ -551,6 +551,9 @@ void BindAnalysisConfig(py::module *m) {
.def("params_file", &AnalysisConfig::params_file)
.def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
.def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16,
py::arg("gpu_fp16_disabled_op_types") =
std::unordered_set<std::string>({}))
.def("enable_xpu", &AnalysisConfig::EnableXpu,
py::arg("l3_workspace_size") = 16 * 1024 * 1024,
py::arg("locked") = false, py::arg("autotune") = true,
......
......@@ -3,12 +3,22 @@ if (NOT WITH_INFRT)
endif()
option(INFRT_WITH_PHI "Compile INFRT with PHI" ON)
option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF)
option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF)
#TODO(xiaowei) remove fluid
include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
if (INFRT_WITH_PHI)
add_definitions("-DINFRT_WITH_PHI")
add_definitions("-DINFRT_WITH_PHI")
# TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later.
if (INFRT_WITH_GPU)
add_definitions("-DINFRT_WITH_GPU")
if (INFRT_WITH_TRT)
add_definitions("-DINFRT_WITH_TRT")
endif()
endif()
endif()
# compile flags
......@@ -92,7 +102,6 @@ set(infrt_mlir_incs
test_kernels_inc
tensor_shape_inc
dense_tensor_inc
pd_ops_inc
pd_extra_ops_inc
trt_ops_inc
)
......@@ -106,6 +115,9 @@ if (INFRT_WITH_PHI)
endif()
cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive)
if (INFRT_WITH_TRT)
target_link_libraries(infrt infrt_trt)
endif()
cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto)
add_dependencies(infrt ${infrt_mlir_incs} mlir-headers)
......
......@@ -13,6 +13,10 @@ limitations under the License. */
#include "paddle/phi/core/allocator.h"
#ifdef INFRT_WITH_GPU
#include <cuda_runtime.h>
#endif
namespace infrt {
namespace backends {
......@@ -29,5 +33,22 @@ class CpuPhiAllocator : public phi::Allocator {
}
};
#ifdef INFRT_WITH_GPU
// TODO(wilber): Just for demo test. we need a more efficient gpu allocator.
// Minimal phi::Allocator backed directly by cudaMalloc/cudaFree.
class GpuPhiAllocator : public phi::Allocator {
 public:
  // Deleter attached to every allocation: releases device memory.
  static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); }

  // Allocates `bytes_size` bytes of device (GPU) memory.
  // NOTE(review): the cudaMalloc return code is ignored, so on allocation
  // failure the returned Allocation wraps an undefined pointer -- fine for a
  // demo, but should be checked before production use.  Also consider marking
  // this `override` if phi::Allocator::Allocate is virtual (confirm in phi).
  AllocationPtr Allocate(size_t bytes_size) {
    void* ptr;
    cudaMalloc(&ptr, bytes_size);
    return AllocationPtr(
        new phi::Allocation(
            ptr, bytes_size, phi::Place(phi::AllocationType::GPU)),
        deleter);
  }
};
#endif
} // namespace backends
} // namespace infrt
......@@ -13,6 +13,7 @@ limitations under the License. */
#include "paddle/infrt/backends/host/phi_allocator.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace infrt {
namespace backends {
......@@ -31,5 +32,16 @@ class CpuPhiContext : public phi::CPUContext {
std::unique_ptr<phi::Allocator> alloc_{std::make_unique<CpuPhiAllocator>()};
};
// Thin wrapper over phi::GPUContext that re-exports the protected Set*
// configuration hooks publicly, so infrt code can wire up the stream and the
// math-library handles from outside the class.
class GpuPhiContext : public phi::GPUContext {
 public:
  using Base = phi::GPUContext;
  using phi::GPUContext::SetStream;
  using phi::GPUContext::SetEigenDevice;
  using phi::GPUContext::SetBlasHandle;
  using phi::GPUContext::SetDnnHandle;
  using phi::GPUContext::SetSolverHandle;
  using phi::GPUContext::SetSparseHandle;
};
} // namespace backends
} // namespace infrt
......@@ -37,9 +37,9 @@ namespace infrt {
namespace backends {
namespace tensorrt {
const char* model_input = "model_input";
const char* model_output = "model_output1";
const char* model_output2 = "model_output2";
const char* model_input = "input_0";
const char* model_output = "output_0";
const char* model_output2 = "output_1";
TrtUniquePtr<nvinfer1::INetworkDefinition> ConstructNetwork(
nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) {
......@@ -122,27 +122,26 @@ TEST(trt, run_static) {
std::unordered_map<std::string, phi::DenseTensor*> inputs;
inputs.emplace(std::make_pair(model_input, &input));
phi::DenseTensor output, output2;
std::unordered_map<std::string, phi::DenseTensor*> outputs;
outputs.emplace(std::make_pair(model_output, &output));
outputs.emplace(std::make_pair(model_output2, &output2));
static_trt_engine.SetUpInference(inference_options, inputs, &outputs);
static_trt_engine.PrepareOutputHandle("output_0");
static_trt_engine.PrepareOutputHandle("output_1");
static_trt_engine.SetUpInference(inference_options, inputs);
static_trt_engine.GetEngineInfo();
static_trt_engine.Run(context);
phi::DenseTensor* output0 = static_trt_engine.GetOutput("output_0");
phi::DenseTensor* output1 = static_trt_engine.GetOutput("output_1");
std::vector<float> output_data1(inference_options.batch * 1 * 28 * 28, 0);
std::vector<float> output_data2(inference_options.batch * 2 * 28 * 28, 0);
paddle::memory::Copy(phi::CPUPlace(),
output_data1.data(),
place,
output.data<float>(),
output0->data<float>(),
sizeof(float) * output_data1.size(),
context.stream());
paddle::memory::Copy(phi::CPUPlace(),
output_data2.data(),
place,
output2.data<float>(),
output1->data<float>(),
sizeof(float) * output_data2.size(),
context.stream());
cudaStreamSynchronize(context.stream());
......@@ -208,27 +207,27 @@ TEST(trt, run_dynamic) {
context.stream());
std::unordered_map<std::string, phi::DenseTensor*> inputs;
std::unordered_map<std::string, phi::DenseTensor*> outputs;
inputs.emplace(std::make_pair(model_input, &input));
outputs.emplace(std::make_pair(model_output, &output));
outputs.emplace(std::make_pair(model_output2, &output2));
engine.SetUpInference(inference_options, inputs, &outputs);
engine.PrepareOutputHandle("output_0");
engine.PrepareOutputHandle("output_1");
engine.SetUpInference(inference_options, inputs);
engine.GetEngineInfo();
engine.Run(context);
phi::DenseTensor* output0 = engine.GetOutput("output_0");
phi::DenseTensor* output1 = engine.GetOutput("output_1");
std::vector<float> output_data1(inference_options.batch * 1 * 16 * 16, 0);
std::vector<float> output_data2(inference_options.batch * 2 * 16 * 16, 0);
paddle::memory::Copy(phi::CPUPlace(),
output_data1.data(),
place,
output.data<float>(),
output0->data<float>(),
sizeof(float) * output_data1.size(),
context.stream());
paddle::memory::Copy(phi::CPUPlace(),
output_data2.data(),
place,
output2.data<float>(),
output1->data<float>(),
sizeof(float) * output_data2.size(),
context.stream());
cudaStreamSynchronize(context.stream());
......
......@@ -21,6 +21,7 @@
#include "paddle/phi/backends/dynload/tensorrt.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
namespace infrt {
namespace backends {
......@@ -235,10 +236,20 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build,
return true;
}
// Registers an output slot named |out_name|; during Run() the engine writes
// the corresponding binding's result into this tensor, and callers fetch it
// afterwards via GetOutput(out_name).  Calling this twice with the same name
// keeps the first tensor (same as the previous emplace-based behavior).
void TrtEngine::PrepareOutputHandle(const std::string& out_name) {
  // operator[] default-constructs the DenseTensor in place only when the key
  // is absent, avoiding the extra copy made by the old
  // `phi::DenseTensor t; outputs_.emplace(out_name, t);` pattern.
  outputs_[out_name];
}
// Returns a pointer to the output tensor registered under |name|.
// NOTE: uses map operator[], so asking for a name that was never passed to
// PrepareOutputHandle() silently inserts (and returns) an empty tensor
// rather than failing.
phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) {
  return &outputs_[name];
}
// Number of output slots registered via PrepareOutputHandle().
size_t TrtEngine::GetOutputNum() const { return outputs_.size(); }
bool TrtEngine::SetUpInference(
const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
std::unordered_map<std::string, phi::DenseTensor*>* outputs) {
const std::unordered_map<std::string, phi::DenseTensor*>& inputs) {
// TODO(wilber): now only create one exec_context
FreshDeviceId();
CHECK(engine_ != nullptr);
......@@ -252,10 +263,10 @@ bool TrtEngine::SetUpInference(
bindings_.front()->AddBinding(
bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT);
}
for (auto& it : *outputs) {
for (auto& it : outputs_) {
const int bind_index = engine_->getBindingIndex(it.first.c_str());
bindings_.front()->AddBinding(
bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT);
bind_index, it.first, false, &it.second, nvinfer1::DataType::kFLOAT);
}
return true;
......@@ -290,11 +301,13 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
const int bind_index = engine_->getBindingIndex(bind.name.c_str());
std::vector<int32_t> ddim;
auto dims = engine_->getBindingDimensions(bind_index);
CHECK_NE(runtime_batch, -1) << "runtime_batch should not be -1.";
ddim.push_back(runtime_batch);
for (int i = 0; i < dims.nbDims; ++i) {
ddim.push_back(dims.d[i]);
}
bind.buffer->Resize(phi::make_ddim(ddim));
// TODO(wilber): now only support float output.
ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
}
......
......@@ -81,11 +81,17 @@ class TrtEngine {
// TODO(wilber): How to support multiple execution contexts?
bool SetUpInference(
const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
std::unordered_map<std::string, phi::DenseTensor*>* outputs);
const std::unordered_map<std::string, phi::DenseTensor*>& inputs);
void GetEngineInfo();
void PrepareOutputHandle(const std::string& out_name);
// TODO(wilber): The output tensor names are: output_0, output_1, ...
phi::DenseTensor* GetOutput(const std::string&);
size_t GetOutputNum() const;
private:
void FreshDeviceId();
......@@ -112,6 +118,7 @@ class TrtEngine {
std::vector<std::unique_ptr<Bindings>> bindings_;
int device_id_{0};
bool is_dynamic_shape_{false};
std::unordered_map<std::string, phi::DenseTensor> outputs_;
};
} // namespace tensorrt
......
......@@ -7,16 +7,10 @@ gather_srcs(infrt_src SRCS
dense_tensor.cc
mlir_loader.cc
diagnostic_utils.cc
pd_ops.cc
)
mlir_tablegen_on(tensor_shape DIALECT ts)
mlir_tablegen_on(dense_tensor DIALECT dt)
mlir_tablegen_on(pd_op_base DIALECT pd)
mlir_tablegen_on(pd_ops)
mlir_tablegen_on(pd_extra_ops)
mlir_add_rewriter(rewrite)
# TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code
add_executable(infrtopt opt.cc)
......@@ -24,10 +18,10 @@ target_link_libraries(infrtopt infrt)
add_executable(print-ir print_ir.cc)
target_link_libraries(print-ir infrt ${mlir_libs})
add_dependencies(print-ir pd_ops_inc)
cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS})
add_subdirectory(infrt)
add_subdirectory(pd)
add_subdirectory(tensorrt)
if (INFRT_WITH_PHI)
......
......@@ -130,7 +130,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> {
}
def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> {
let summary = "ddt.tensor_map_get_size operation";
let summary = "dt.tensor_map_get_size operation";
let description = [{
An operation that get the size of a TensorMap.
......@@ -141,6 +141,32 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> {
let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)";
}
// dt.tensor_list_get_tensor: fetch the tensor at static index `id` from a
// !infrt.tensor_list value.
def Infrt_TensorListGetTensorOp : DT_Op<"tensor_list_get_tensor", [NoSideEffect]> {
  let summary = "dt.tensor_list_get_tensor operation";

  let description = [{
      An operation that can get a tensor from a TensorList.
  }];

  let arguments = (ins
    DenseTensorList:$l,
    I32Attr:$id
  );
  let results = (outs DenseTensor:$output);
  // Skip the auto-generated verifier for this op.
  let verifier = ?;
}
// dt.tensor_list_get_size: return the number of tensors in a
// !infrt.tensor_list value.
// NOTE(review): the operand is named $map, apparently copied from the
// TensorMap op above; $list would be clearer, but renaming would change the
// generated accessor, so it is only flagged here.
def TensorListGetSizeOp : DT_Op<"tensor_list_get_size", [NoSideEffect]> {
  let summary = "dt.tensor_list_get_size operation";

  let description = [{
      An operation that get the size of a TensorList.
  }];

  let arguments = (ins DenseTensorList:$map);
  let results = (outs I32:$size);
}
def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> {
let summary = "dt.get_tensor_shape operation";
......
......@@ -89,6 +89,13 @@ def DenseTensorMap : Infrt_Type<"DenseTensorMap"> {
let parameters = (ins);
}
// TODO(wilber): Add !infrt.vec type.
// !infrt.tensor_list: a list of dense tensors.
// Fix: summary/description previously said "dense tensor map" -- a
// copy-paste from the DenseTensorMap definition above.
def DenseTensorList : Infrt_Type<"DenseTensorList"> {
  let summary = "infrt dense tensor list";
  let description = [{dense_tensor list}];
  let parameters = (ins);
}
// Type Constrait for concrete DenseTensor type.
class DenseTensor<string target, string precision, string layout> :
Type<CPred<"$_self == ::infrt::DenseTensorType::get($_self.getContext(), ::infrt::TargetType::"#target#",::infrt::PrecisionType::"#precision#",::infrt::LayoutType::"#layout#")">,
......
......@@ -138,6 +138,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
parser.getContext(), *targetType, *precisionType, *layoutType);
}
if (keyword == "tensor_list") {
return infrt::DenseTensorListType::get(parser.getContext());
}
if (keyword == "dense_tensor_map") {
return DenseTensorMapType::get(parser.getContext());
}
......@@ -175,6 +179,9 @@ void InfrtDialect::printType(::mlir::Type type,
return;
}
if (type.isa<infrt::DenseTensorListType>()) {
os << "tensor_list";
}
// print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
if (type.isa<DenseTensorMapType>()) {
os << "dense_tensor_map";
......
......@@ -3,7 +3,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "paddle/infrt/dialect/infrt/ir/infrt_ops.td"
include "paddle/infrt/dialect/pd_ops.td"
include "paddle/infrt/dialect/pd/ir/pd_ops.td"
def FuseTensorCastPattern : Pat<
(Infrt_TensorCastOp (Infrt_TensorCastOp $arg)),
......
......@@ -16,7 +16,7 @@
#include <mlir/Transforms/GreedyPatternRewriteDriver.h>
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace {
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT
......
......@@ -20,12 +20,13 @@
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
#include "paddle/infrt/dialect/tensor_shape.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
namespace infrt {
void registerCinnDialects(mlir::DialectRegistry &registry) { // NOLINT
......@@ -37,7 +38,8 @@ void registerCinnDialects(mlir::DialectRegistry &registry) { // NOLINT
phi::PHIDenseTensorDialect,
phi::PHICPUKernelDialect,
phi::PHIGPUKernelDialect,
phi::PHIDialect
phi::PHIDialect,
infrt::trt::TensorRTDialect
#endif
>();
}
......
add_subdirectory(common)
add_subdirectory(ir)
add_subdirectory(pass)
core_gather_headers()
gather_srcs(infrt_src SRCS
)
core_gather_headers()
gather_srcs(infrt_src SRCS
pd_ops.cc
)
add_mlir_dialect(pd_ops pd)
mlir_tablegen_on(pd_extra_ops)
......@@ -4,7 +4,7 @@
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/LoopLikeInterface.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/pd_op_base.td"
include "paddle/infrt/dialect/pd/ir/pd_op_base.td"
def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> {
let summary = "Computes the Fully Connected result of two tensors";
......
......@@ -8,7 +8,7 @@ include "mlir/IR/OpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
def PD_Dialect : Dialect {
def Paddle_Dialect : Dialect {
let name = "pd";
let description = [{
......@@ -16,12 +16,12 @@ def PD_Dialect : Dialect {
This dialect contains the PaddlePaddle operators.
}];
let hasConstantMaterializer = 1;
let cppNamespace = "mlir::pd";
}
class PD_Op<string mnemonic, list<OpTrait> traits = []> :
Op<PD_Dialect, mnemonic, traits>;
Op<Paddle_Dialect, mnemonic, traits>;
class PD_PaddleAttr <string name, string description> :
......
......@@ -12,29 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include <mlir/IR/Matchers.h>
#include <mlir/IR/PatternMatch.h>
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.cpp.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT
namespace mlir {
namespace pd {
#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT
PaddleDialect::PaddleDialect(MLIRContext *context)
: Dialect("pd", context, TypeID::get<PaddleDialect>()) {
void PaddleDialect::initialize() {
addOperations<
#define GET_OP_LIST
#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT
,
#define GET_OP_LIST
#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT
>();
}
......@@ -73,106 +71,5 @@ mlir::OpFoldResult ConstantOp::fold(
::llvm::ArrayRef<mlir::Attribute> operands) {
return value();
}
/*
LogicalResult ElementwiseAdd::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
*/
void Elementwise_addOp::getCanonicalizationPatterns(
mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
results.insert<FuseMulAdd>(context);
}
/*
mlir::OpFoldResult ElementwiseAdd::fold(
llvm::ArrayRef<mlir::Attribute> operands) {
if (getElementTypeOrSelf(getType()).isa<FloatType>()) {
if (!operands[0] || !operands[1]) return {};
DenseElementsAttr lhs = operands[0].dyn_cast<DenseElementsAttr>();
DenseElementsAttr rhs = operands[1].dyn_cast<DenseElementsAttr>();
if (!lhs || !rhs) return {};
ShapedType type = getType().template cast<ShapedType>();
if (!type.hasStaticShape()) return {};
Type etype = type.getElementType();
if (!etype.isa<FloatType>()) return {};
SmallVector<APFloat, 6> values;
values.reserve(lhs.getNumElements());
for (const auto zip :
llvm::zip(lhs.getValues<APFloat>(), rhs.getValues<APFloat>())) {
values.push_back(
std::plus<APFloat>()(std::get<0>(zip), std::get<1>(zip)));
}
return DenseElementsAttr::get(type, values);
}
return {};
}
LogicalResult ElementwiseDiv::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
LogicalResult ElementwiseMul::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
LogicalResult ElementwiseSub::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
LogicalResult MulOp::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
void ReluOp::getCanonicalizationPatterns(
mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
results.insert<FuseFCRelu>(context);
}
void FusedRepeatedFCRelu::getCanonicalizationPatterns(
mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
results.insert<FuseRepeatedFCRelu2>(context);
}
void BatchNormOp::getCanonicalizationPatterns(
mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
results.insert<FuseBatchNormWithConvPattern>(context);
}*/
} // namespace pd
} // namespace mlir
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,49 +14,20 @@
#pragma once
#include <mlir/Dialect/Traits.h>
#include <mlir/IR/Attributes.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/BuiltinOps.h>
//===----------------------------------------------------------------------===//
// Dialect
//===----------------------------------------------------------------------===//
#include <llvm/ADT/StringMap.h>
#include <mlir/IR/BuiltinTypes.h>
#include <mlir/IR/Dialect.h>
#include <mlir/IR/Matchers.h>
#include <mlir/IR/OpDefinition.h>
#include <mlir/IR/OpImplementation.h>
#include <mlir/IR/TypeUtilities.h>
#include <mlir/Interfaces/CallInterfaces.h>
#include <mlir/Interfaces/DerivedAttributeOpInterface.h>
#include <mlir/Interfaces/InferTypeOpInterface.h>
#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
namespace mlir {
namespace pd {
class PaddleDialect : public Dialect {
public:
explicit PaddleDialect(MLIRContext* context);
static StringRef getDialectNamespace() { return "pd"; }
/// A hook used to materialize constant values with the given type.
Operation* materializeConstant(OpBuilder& builder,
Attribute value,
Type type,
Location loc) override;
Type parseType(DialectAsmParser& parser) const override {
return Dialect::parseType(parser);
}
void printType(Type type, DialectAsmPrinter& printer) const override {
Dialect::printType(type, printer);
}
};
} // namespace pd
} // namespace mlir
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.h.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/pd_ops.hpp.inc"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/pd_extra_ops.hpp.inc"
#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.hpp.inc"
core_gather_headers()
gather_srcs(infrt_src SRCS
pd_op_fuse_pass.cc
)
mlir_add_rewriter(pd_op_fuse)
......@@ -3,8 +3,8 @@
include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "paddle/infrt/dialect/pd_ops.td"
include "paddle/infrt/dialect/pd_extra_ops.td"
include "paddle/infrt/dialect/pd/ir/pd_ops.td"
include "paddle/infrt/dialect/pd/ir/pd_extra_ops.td"
//===----------------------------------------------------------------------===//
// This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'.
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT
#include <mlir/Transforms/GreedyPatternRewriteDriver.h>
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace {
#include "paddle/infrt/dialect/pd/pass/pd_op_fuse.cpp.inc" // NOLINT
/*
* PdOpFusePass.
*/
struct PdOpFusePass
: public mlir::PassWrapper<PdOpFusePass, mlir::FunctionPass> {
public:
::llvm::StringRef getName() const override { return "PdOpFusePass"; }
llvm::StringRef getArgument() const override { return "pd-op-fuse"; }
void runOnFunction() override;
};
// Implementation of the PdOpFusePass.
void PdOpFusePass::runOnFunction() {
::mlir::RewritePatternSet patterns(&getContext());
populateWithGenerated(patterns);
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
} // namespace
mlir::PassRegistration<PdOpFusePass> infrt_op_fuse_pass;
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mlir/Pass/Pass.h>
namespace infrt {
/*
* PdOpFusePass.
*/
std::unique_ptr<mlir::Pass> CreatePdOpFusePass();
} // namespace infrt
......@@ -21,8 +21,8 @@ def PHI_DenseTensorDialect : Dialect {
class PDT_Op<string mnemonic, list<OpTrait> traits = []> : Op<PHI_DenseTensorDialect,
mnemonic, !listconcat(traits, [PhiOpTrait, IsolatedFromAbove])> {}
class CreateDenseTensorOp
: PDT_Op<"create_dense_tensor", [NoSideEffect]> {
class CreateDenseTensorOp<string target>
: PDT_Op<"create_dense_tensor." # target, [NoSideEffect]> {
let arguments = (ins Context:$context, I64ArrayAttr:$dims,
LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision);
let results = (outs DenseTensor:$output);
......@@ -51,9 +51,11 @@ class CreateContextOp<string target>
let results = (outs Context:$output);
}
def PDT_CreateDenseTensorOp : CreateDenseTensorOp;
def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">;
def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">;
def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp<F32ArrayAttr, "f32">;
def PDT_CreateCPUContextOp : CreateContextOp<"cpu">;
def PDT_CreateGPUContextOp : CreateContextOp<"gpu">;
def PDT_PrintDenseTensor : PrintDenseTensorOp;
def FakeKernelOp : PDT_Op<"fake_phi_kernel"> {
......
......@@ -32,6 +32,7 @@
#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h"
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/phi/core/kernel_factory.h"
#include "paddle/phi/ops/compat/signatures.h"
namespace {
......@@ -94,42 +95,49 @@ void PhiOpConvertPass::convertStage() {
// Todo: print log
continue;
}
::phi::KernelSignature kernel_sign =
::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)(
infrt::ProtoArgumentMappingContext(op));
// resort input&output according to kernel_sign
::llvm::SmallVector<mlir::Value, 4> inputs, ori_output;
::llvm::SmallVector<mlir::Type, 4> output_types;
for (const std::string &str : std::get<0>(kernel_sign.args)) {
if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) {
LOG(ERROR) << "No input info for Op " << op_name << " and argument "
<< str;
return;
auto loc = getFunction().getLoc();
builder.setInsertionPoint(op);
if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) {
std::string kernel_name = phi::TransToPhiKernelName(op_name);
auto kernel_op = builder.create<infrt::KernelOp>(loc,
op->getResultTypes(),
op->getOperands(),
kernel_name,
op->getAttrDictionary());
op->replaceAllUsesWith(kernel_op.getResults());
} else {
::phi::KernelSignature kernel_sign =
::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)(
infrt::ProtoArgumentMappingContext(op));
// resort input&output according to kernel_sign
::llvm::SmallVector<mlir::Value, 4> inputs, ori_output;
::llvm::SmallVector<mlir::Type, 4> output_types;
for (const std::string &str : std::get<0>(kernel_sign.args)) {
if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) {
LOG(ERROR) << "No input info for Op " << op_name << " and argument "
<< str;
return;
}
uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str);
inputs.push_back(op->getOperands()[index]);
}
uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str);
inputs.push_back(op->getOperands()[index]);
}
for (const std::string &str : std::get<2>(kernel_sign.args)) {
if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) {
LOG(ERROR) << "No output info for Op " << op_name << " and argument "
<< str;
return;
for (const std::string &str : std::get<2>(kernel_sign.args)) {
if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) {
LOG(ERROR) << "No output info for Op " << op_name << " and argument "
<< str;
return;
}
uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str);
output_types.push_back(op->getResultTypes()[index]);
ori_output.push_back(op->getResult(index));
}
auto kernel_op = builder.create<infrt::KernelOp>(
loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary());
for (size_t index = 0; index < ori_output.size(); ++index) {
ori_output[index].replaceAllUsesWith(kernel_op.getResult(index));
}
uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str);
output_types.push_back(op->getResultTypes()[index]);
ori_output.push_back(op->getResult(index));
}
auto loc = getFunction().getLoc();
builder.setInsertionPoint(op);
auto kernel_op = builder.create<infrt::KernelOp>(
loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary());
for (size_t index = 0; index < ori_output.size(); ++index) {
ori_output[index].replaceAllUsesWith(kernel_op.getResult(index));
}
CHECK(op->use_empty());
op->erase();
}
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <mlir/IR/Operation.h>
#include <unordered_map>
#include "paddle/infrt/dialect/pd_ops_info.h"
#include "paddle/infrt/dialect/pd/common/pd_ops_info.h"
#include "paddle/phi/core/compat/arg_map_context.h"
namespace infrt {
......
......@@ -3,7 +3,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
include "paddle/infrt/dialect/pd_ops.td"
include "paddle/infrt/dialect/pd/ir/pd_ops.td"
include "paddle/infrt/dialect/tensorrt/trt_ops.td"
def PD2TRT_Matmul_Lower : Pat<
......
......@@ -17,11 +17,12 @@
#include <llvm/ADT/SetVector.h>
#include <mlir/Analysis/SliceAnalysis.h>
#include <mlir/IR/Builders.h>
#include <paddle/infrt/dialect/pd_ops.h>
#include <list>
#include <unordered_set>
#include <vector>
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace infrt {
namespace trt {
namespace {
......
......@@ -15,7 +15,7 @@
#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
#include <mlir/IR/Builders.h>
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace infrt {
namespace trt {
......
......@@ -14,7 +14,7 @@
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
#include <mlir/IR/Builders.h>
#include <mlir/Transforms/DialectConversion.h>
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
namespace infrt {
......
......@@ -17,7 +17,7 @@
#include <mlir/IR/Builders.h>
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace infrt {
namespace trt {
......
......@@ -21,6 +21,10 @@
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
namespace infrt {
namespace trt {
......
......@@ -30,7 +30,7 @@
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace infrt {
namespace trt {
......
......@@ -7,6 +7,8 @@ include "mlir/Interfaces/CallInterfaces.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/tensorrt/trt_op_base.td"
include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> {
let summary = "trt CreateEngine Op";
......@@ -14,8 +16,8 @@ def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<
Describe a tensorrt subgraph.
}];
let regions = (region SizedRegion<1>:$body);
let arguments = (ins Variadic<TRT_Tensor>:$inputs, DefaultValuedAttr<BoolAttr, "true">:$run_once);
let results = (outs TRT_EngineType:$output);
let arguments = (ins Variadic<DenseTensor>:$inputs, DefaultValuedAttr<BoolAttr, "true">:$run_once);
let results = (outs TRT_EngineType:$engine);
}
def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> {
......@@ -23,8 +25,25 @@ def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> {
let description = [{
Describe a tensorrt runtime.
}];
let arguments = (ins TRT_EngineType:$engine, Variadic<TRT_Tensor>:$inputs);
let results = (outs Variadic<TRT_Tensor>:$output);
let arguments = (ins TRT_EngineType:$engine, Variadic<DenseTensor>:$inputs);
let results = (outs Variadic<DenseTensor>:$output);
}
def TRT_EngineComputeOp : TRT_Op<"compute", [NoSideEffect]> {
let summary = "trt compute engine";
let description = [{
execute engine
}];
let arguments = (ins TRT_EngineType:$engine, Context:$context);
let results = (outs DenseTensorList:$outputs);
}
def TRT_InspectEngineOp : TRT_Op<"inspect_engine", [NoSideEffect]> {
let summary = "trt inspect engine";
let description = [{
Show engine
}];
let arguments = (ins TRT_EngineType:$engine);
}
def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> {
......@@ -34,11 +53,11 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> {
TensorRT IActivationLayer.
}];
let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type,
let arguments = (ins DenseTensor:$input, SI32Attr:$activation_type,
DefaultValuedAttr<F32Attr, "0.0">:$alpha,
DefaultValuedAttr<F32Attr, "0.0">:$beta);
let results = (outs TRT_Tensor:$output);
let results = (outs DenseTensor:$output);
}
def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> {
......@@ -48,9 +67,9 @@ def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> {
TensorRT IElementWiseLayer.
}];
let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation);
let arguments = (ins DenseTensor:$input1, DenseTensor:$input2, SI32Attr:$elementwise_operation);
let results = (outs TRT_Tensor:$output);
let results = (outs DenseTensor:$output);
}
def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> {
......@@ -60,10 +79,10 @@ def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> {
TensorRT IMatrixMultiplyLayer.
}];
let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1,
TRT_Tensor:$input2, BoolAttr:$transpose2);
let arguments = (ins DenseTensor:$input1, BoolAttr:$transpose1,
DenseTensor:$input2, BoolAttr:$transpose2);
let results = (outs TRT_Tensor:$output);
let results = (outs DenseTensor:$output);
}
#endif // TRT_OPS
......@@ -33,7 +33,10 @@
#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
#include "paddle/infrt/kernel/phi/registry.h"
#endif
#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
#include "paddle/infrt/kernel/tensorrt/registry.h"
#endif // INFRT_WITH_GPU && INFRT_WITH_TRT
#endif // INFRT_WITH_PHI
static llvm::cl::list<std::string> cl_shared_libs( // NOLINT
"shared_libs",
......@@ -62,6 +65,9 @@ int main(int argc, char** argv) {
#ifdef INFRT_WITH_PHI
kernel::RegisterPhiKernels(&registry);
kernel::RegisterInferShapeLaunchers(&registry);
#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
kernel::RegisterTrtKernels(&registry);
#endif // INFRT_WITH_GPU && INFRT_WITH_TRT
#endif
// load extra shared library
......
......@@ -16,12 +16,14 @@
#include <llvm/Support/SourceMgr.h>
#include <mlir/Dialect/StandardOps/IR/Ops.h>
#include <mlir/IR/BuiltinAttributes.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/BuiltinTypes.h>
#include <mlir/IR/Diagnostics.h>
#include <mlir/IR/OperationSupport.h>
#include <mlir/Parser.h>
#include <glog/logging.h>
#include <iostream>
#include <memory>
#include <string>
......@@ -42,6 +44,13 @@
#include "paddle/infrt/host_context/value.h"
#include "paddle/infrt/tensor/tensor_shape.h"
#ifdef INFRT_WITH_PHI
#ifdef INFRT_WITH_TRT
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
#endif
#include "paddle/phi/core/dense_tensor.h"
#endif
namespace infrt {
namespace host_context {
......@@ -277,33 +286,58 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
impl_->runtime->NewOpExecutable(op->getName().getStringRef().str());
VLOG(3) << "processing general op : " << op->getName().getStringRef().str();
// TODO(wilber): Find a more appropriate way to handle special cases.
if (op->getName().getStringRef() == "trt.create_engine") {
#ifdef INFRT_WITH_TRT
auto* symbols = impl_->runtime->symbol_table();
::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol mlir_operation;
mlir_operation.operation = op;
mlir_operation.symbol_table = symbols;
impl_->cur_op->AppendArgument(new Value(mlir_operation));
// TODO(wilber): how to pass DenseTensor to create_engine op? temporialiy
// add a naive implement.
for (int i = 0, e = op->getNumOperands(); i < e; ++i) {
auto operand = op->getOperand(i);
if (operand.isa<mlir::BlockArgument>()) {
mlir::BlockArgument arg = operand.dyn_cast<mlir::BlockArgument>();
Value* arg_value = GetValue(arg);
if (arg_value->is_type<phi::DenseTensor>()) {
impl_->runtime->FeedInArgs(
std::make_pair(std::to_string(i), ValueRef(arg_value)));
}
}
}
#else
CHECK(false) << "should not reach here";
#endif
} else {
// process operands
for (int i = 0, e = op->getNumOperands(); i < e; i++) {
// function argument as value
auto operand = op->getOperand(i);
/// if (operand.getKind() == mlir::Value::Kind::BlockArgument) {
if (operand.isa<mlir::BlockArgument>()) {
mlir::BlockArgument arg = operand.dyn_cast<mlir::BlockArgument>();
Value* arg_value = GetValue(arg);
impl_->cur_op->AppendArgument(arg_value);
VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " "
<< GetValue(arg);
continue;
}
// process operands
for (int i = 0, e = op->getNumOperands(); i < e; i++) {
// function argument as value
auto operand = op->getOperand(i);
/// if (operand.getKind() == mlir::Value::Kind::BlockArgument) {
if (operand.isa<mlir::BlockArgument>()) {
mlir::BlockArgument arg = operand.dyn_cast<mlir::BlockArgument>();
Value* arg_value = GetValue(arg);
// normal value
Value* arg_value = GetValue(operand);
if (!arg_value) {
auto upstream_op = operand.getDefiningOp();
arg_value = GetOpResult(upstream_op);
}
CHECK(arg_value) << "No-exist argument value found: "
<< DumpToString(operand);
impl_->cur_op->AppendArgument(arg_value);
VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " "
<< GetValue(arg);
continue;
}
// normal value
Value* arg_value = GetValue(operand);
if (!arg_value) {
auto upstream_op = operand.getDefiningOp();
arg_value = GetOpResult(upstream_op);
VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " "
<< GetValue(operand) << " vs " << arg_value;
}
CHECK(arg_value) << "No-exist argument value found: "
<< DumpToString(operand);
impl_->cur_op->AppendArgument(arg_value);
VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " "
<< GetValue(operand) << " vs " << arg_value;
}
// process attributes
......@@ -383,33 +417,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
impl_->cur_op->AppendAttribute(tmp[i]);
}
// process results
llvm::SmallVector<Value*, 4> res_values;
for (int i = 0, e = op->getNumResults(); i < e; i++) {
auto res = op->getResult(i);
if (res.getType().isa<::infrt::DenseTensorType>()) {
auto r = impl_->value_map.try_emplace(
res, ValueRef(new Value{::phi::DenseTensor()}));
CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res)
<< "]";
res_values.push_back(r.first->second.get());
} else {
res_values.push_back(AddValue(res));
}
VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res);
}
impl_->cur_op->SetResults(res_values);
#ifdef INFRT_DEBUG
{
VLOG(3) << "check result";
for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) {
VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i];
}
}
#endif
// process regions, we treat regions as attribute.
auto num_regions = op->getNumRegions();
if (num_regions > 0) {
......@@ -438,6 +445,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
impl_->cur_op->AppendAttribute(new Value(function));
}
// process results
llvm::SmallVector<Value*, 4> res_values;
for (int i = 0, e = op->getNumResults(); i < e; i++) {
auto res = op->getResult(i);
if (res.getType().isa<::infrt::DenseTensorType>()) {
auto r = impl_->value_map.try_emplace(
res, ValueRef(new Value{::phi::DenseTensor()}));
CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res)
<< "]";
res_values.push_back(r.first->second.get());
} else {
res_values.push_back(AddValue(res));
}
VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res);
}
impl_->cur_op->SetResults(res_values);
#ifdef INFRT_DEBUG
{
VLOG(3) << "check result";
for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) {
VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i];
}
}
#endif
return true;
}
......
......@@ -15,7 +15,7 @@
#include "paddle/infrt/host_context/paddle_mlir.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops_info.h"
#include "paddle/infrt/dialect/pd/common/pd_ops_info.h"
MLIRModelGenImpl::MLIRModelGenImpl()
: context_(infrt::Global::getMLIRContext()), builder_(context_) {
......
......@@ -14,22 +14,22 @@
#ifndef PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_
#define PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_
#include <llvm/Support/CommandLine.h>
#include <mlir/Dialect/StandardOps/IR/Ops.h>
#include <mlir/IR/AsmState.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/MLIRContext.h>
#include <fstream>
#include <iostream>
#include <string>
#include "llvm/Support/CommandLine.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/common/string.h"
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/init_dialects.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/tensor_shape.h"
#include "paddle/infrt/paddle/model_parser.h"
......
......@@ -24,6 +24,7 @@
#include "paddle/infrt/common/shared.h"
#include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/host_context/function.h"
#include "paddle/infrt/host_context/symbol_table.h"
#include "paddle/infrt/support/variant.h"
#include "paddle/infrt/tensor/dense_host_tensor.h"
#include "paddle/infrt/tensor/dense_tensor_view.h"
......@@ -41,7 +42,15 @@
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#endif
#ifdef INFRT_WITH_GPU
#include "paddle/phi/backends/gpu/gpu_context.h"
#endif // INFRT_WITH_GPU
#ifdef INFRT_WITH_TRT
#include "paddle/infrt/backends/tensorrt/trt_engine.h"
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
#endif // INFRT_WITH_TRT
#endif // INFRT_WITH_PHI
namespace infrt {
namespace host_context {
......@@ -72,8 +81,13 @@ using ValueVariantType =
::phi::MetaTensor,
::phi::DenseTensor,
backends::CpuPhiContext,
#ifdef INFRT_WITH_GPU
backends::GpuPhiContext,
::phi::GPUContext,
#endif
::phi::CPUContext,
std::vector<const phi::DenseTensor*>,
std::vector<phi::DenseTensor*>,
paddle::experimental::ScalarBase<phi::DenseTensor>,
paddle::experimental::ScalarArrayBase<phi::DenseTensor>,
std::vector<phi::MetaTensor*>,
......@@ -81,6 +95,10 @@ using ValueVariantType =
paddle::experimental::Backend,
paddle::experimental::DataLayout,
paddle::experimental::DataType,
#ifdef INFRT_WITH_TRT
::infrt::backends::tensorrt::TrtEngine,
::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol,
#endif // INFRT_WITH_TRT
#endif
std::vector<int16_t>,
std::vector<int32_t>,
......@@ -120,8 +138,18 @@ class Value : public common::Object {
#ifdef INFRT_WITH_PHI
explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {}
explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {}
#ifdef INFRT_WITH_GPU
explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {}
explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {}
#endif
explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {}
explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {}
#ifdef INFRT_WITH_TRT
explicit Value(::infrt::backends::tensorrt::TrtEngine&& x)
: data(std::move(x)) {}
explicit Value(::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol x)
: data(x) {}
#endif // INFRT_WITH_TRT
#endif
template <typename T>
......
add_subdirectory(phi)
add_subdirectory(tensorrt)
core_gather_headers()
......
......@@ -25,6 +25,16 @@ namespace phi {
return ctx;
}
#ifdef INFRT_WITH_GPU
// Creates a ::phi::GPUContext for running phi kernels on the default GPU.
// Initialization is split into two phases with the allocator installed in
// between, mirroring the order the phi API requires here.
// NOTE(review): the GpuPhiAllocator is created with a raw `new`; whether
// SetAllocator transfers ownership to the context is not visible from this
// file — confirm upstream to rule out a leak.
::phi::GPUContext CreateGPUContext() {
  ::phi::GPUContext context;
  context.PartialInitWithoutAllocator();
  context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{});
  context.PartialInitWithAllocator();
  return context;
}
#endif
} // namespace phi
} // namespace kernel
} // namespace infrt
......@@ -25,6 +25,10 @@ namespace phi {
::phi::CPUContext CreateCPUContext();
#ifdef INFRT_WITH_GPU
::phi::GPUContext CreateGPUContext();
#endif
} // namespace phi
} // namespace kernel
} // namespace infrt
......@@ -15,6 +15,12 @@
#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
#include "paddle/infrt/dialect/phi/data_type.h"
#include "paddle/infrt/kernel/phi/context_kernels.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#ifdef INFRT_WITH_GPU
#include <cuda_runtime.h>
#endif
namespace infrt {
namespace kernel {
......@@ -34,26 +40,83 @@ namespace phi {
{}));
}
// Builds an empty GPU dense tensor whose storage is served by `context`'s
// allocator; dtype/shape/layout come from the kernel attributes. The LoD is
// left empty.
// NOTE(review): the const_cast only adapts DenseTensor's non-const
// Allocator* parameter; the allocator is not mutated here.
::phi::DenseTensor CreateGPUDenseTensor(
    const ::phi::GPUContext& context,
    host_context::Attribute<std::vector<int64_t>> dims,
    host_context::Attribute<std::vector<int64_t>> lod,
    host_context::Attribute<::infrt::LayoutType> layout,
    host_context::Attribute<::infrt::PrecisionType> precision) {
  auto* allocator = const_cast<::phi::Allocator*>(&context.GetAllocator());
  ::phi::DenseTensorMeta meta(ConvertPrecisionToPhi(precision.get()),
                              ::phi::make_ddim(dims.get()),
                              ConvertLayoutToPhi(layout.get()),
                              {});
  return ::phi::DenseTensor(allocator, std::move(meta));
}
// Copies the host float values of the `value` attribute into
// `dense_tensor`, allocating the tensor's memory on its own place.
// CPU places get a direct element-wise copy; GPU places (only when built
// with INFRT_WITH_GPU) use a synchronous cudaMemcpy H2D; any other target
// aborts.
// NOTE(review): the CPU path assumes value.get().size() >= numel() — the
// size is not validated here; confirm callers guarantee it.
void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
                        host_context::Attribute<std::vector<float>> value) {
  auto place = dense_tensor->place();
  float* a_data = dense_tensor->mutable_data<float>(place);
  if (place.GetType() == ::phi::AllocationType::CPU) {
    for (int64_t i = 0; i < dense_tensor->numel(); ++i) {
      a_data[i] = (value.get())[i];
    }
  } else if (place.GetType() == ::phi::AllocationType::GPU) {
#ifdef INFRT_WITH_GPU
    // TODO(wilber): how to set the stream parameter to copy with stream.
    cudaMemcpy(a_data,
               value.get().data(),
               sizeof(float) * value.get().size(),
               cudaMemcpyHostToDevice);
#endif
  } else {
    llvm_unreachable("temporarily not support other target.");
  }
}
void PrintDenseTensor(::phi::DenseTensor* dense_tensor) {
#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \
case ::phi::DataType::PHI_DATATYPE: { \
DTYPE* data = dense_tensor->data<DTYPE>(); \
if (dense_tensor->numel() == 0) break; \
std::cout << data[0]; \
for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
std::cout << "," << data[i]; \
} \
break; \
#ifndef INFRT_WITH_GPU
#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \
case ::phi::DataType::PHI_DATATYPE: { \
auto place = dense_tensor->place(); \
if (place.GetType() == ::phi::AllocationType::CPU) { \
DTYPE* data = dense_tensor->data<DTYPE>(); \
if (dense_tensor->numel() == 0) break; \
std::cout << data[0]; \
for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
std::cout << "," << data[i]; \
} \
} \
break; \
}
#else
#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \
case ::phi::DataType::PHI_DATATYPE: { \
auto place = dense_tensor->place(); \
DTYPE* data = dense_tensor->data<DTYPE>(); \
if (dense_tensor->numel() == 0) break; \
if (place.GetType() == ::phi::AllocationType::CPU) { \
std::cout << data[0]; \
for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
std::cout << "," << data[i]; \
} \
} else if (place.GetType() == ::phi::AllocationType::GPU) { \
std::vector<DTYPE> host_data(dense_tensor->numel(), 0); \
cudaMemcpy(host_data.data(), \
data, \
sizeof(DTYPE) * dense_tensor->numel(), \
cudaMemcpyDeviceToHost); \
std::cout << host_data[0]; \
for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
std::cout << "," << host_data[i]; \
} \
} else { \
llvm_unreachable("temporarily not support other target."); \
} \
break; \
}
#endif
::phi::DDim dims = dense_tensor->dims();
std::cout << "dense_tensor: shape=shape" << dims.to_str() << ","
......
......@@ -30,6 +30,13 @@ namespace phi {
host_context::Attribute<::infrt::LayoutType> layout,
host_context::Attribute<::infrt::PrecisionType> precision);
::phi::DenseTensor CreateGPUDenseTensor(
const ::phi::GPUContext& context,
host_context::Attribute<std::vector<int64_t>> dims,
host_context::Attribute<std::vector<int64_t>> lod,
host_context::Attribute<::infrt::LayoutType> layout,
host_context::Attribute<::infrt::PrecisionType> precision);
void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
host_context::Attribute<std::vector<float>> values);
void PrintDenseTensor(::phi::DenseTensor* dense_tensor);
......
......@@ -35,7 +35,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
registry->AddKernel("phi_dt.create_context.cpu",
INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext));
registry->AddKernelWithAttrs(
"phi_dt.create_dense_tensor",
"phi_dt.create_dense_tensor.cpu",
INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor),
{"dims", "lod", "layout", "precision"});
registry->AddKernelWithAttrs(
......@@ -44,6 +44,15 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
{"value"});
registry->AddKernel("phi_dt.print_tensor",
INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor));
#ifdef INFRT_WITH_GPU
registry->AddKernel("phi_dt.create_context.gpu",
INFRT_KERNEL(infrt::kernel::phi::CreateGPUContext));
registry->AddKernelWithAttrs(
"phi_dt.create_dense_tensor.gpu",
INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor),
{"dims", "lod", "layout", "precision"});
#endif
}
} // namespace kernel
......
......@@ -25,6 +25,10 @@
#include "paddle/infrt/tensor/tensor_map.h"
#include "paddle/infrt/tensor/tensor_shape.h"
#ifdef INFRT_WITH_PHI
#include "paddle/phi/core/dense_tensor.h"
#endif
namespace infrt {
namespace kernel {
using namespace host_context; // NOLINT
......@@ -62,6 +66,20 @@ DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute<std::string> name) {
int32_t TensorMapGetSize(TensorMap map) { return map.size(); }
// TODO(wilber): Maybe we should place TensorList type in dt dialect.
#ifdef INFRT_WITH_PHI
// Returns a copy of the tensor stored at position `idx` in the tensor list.
// Aborts via CHECK when `idx` is out of range.
phi::DenseTensor TensorListGetTensor(std::vector<phi::DenseTensor *> list,
                                     Attribute<int32_t> idx) {
  const int32_t pos = idx.get();
  CHECK_LT(pos, static_cast<int>(list.size()))
      << "idx should less than list size";
  return *list[pos];
}
// Returns how many tensors the tensor list currently holds.
int32_t TensorListGetSize(const std::vector<phi::DenseTensor *> &list) {
  return static_cast<int32_t>(list.size());
}
#endif
DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; }
template <typename T>
......@@ -126,6 +144,14 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) {
INFRT_KERNEL(TensorMapGetTensor));
registry->AddKernel("dt.tensor_map_get_size", INFRT_KERNEL(TensorMapGetSize));
// TensorList related methods.
#ifdef INFRT_WITH_PHI
registry->AddKernel("dt.tensor_list_get_tensor",
INFRT_KERNEL(TensorListGetTensor));
registry->AddKernel("dt.tensor_list_get_size",
INFRT_KERNEL(TensorListGetSize));
#endif
registry->AddKernel("dt.shallow_copy_tensor",
INFRT_KERNEL(ShallowCopyTensor));
......
# The TensorRT kernels require PHI, GPU and TRT support simultaneously;
# skip this directory entirely when any of the three is disabled.
if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT))
  return()
endif()

core_gather_headers()

# Add the TensorRT kernel sources to the aggregated infrt source list.
gather_srcs(infrt_src SRCS
    registry.cc
    trt_kernels.cc
)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/kernel/tensorrt/registry.h"
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
namespace infrt {
namespace kernel {
// Registers every TensorRT kernel (engine creation, inspection and
// execution) into the host-context kernel registry.
void RegisterTrtKernels(host_context::KernelRegistry* registry) {
  auto& reg = *registry;
  reg.AddKernel("trt.create_engine", INFRT_KERNEL(tensorrt::CreateTrtEngine));
  reg.AddKernel("trt.inspect_engine", INFRT_KERNEL(tensorrt::PrintTrtLayer));
  reg.AddKernel("trt.compute", INFRT_KERNEL(tensorrt::TrtEngineCompute));
}
} // namespace kernel
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
namespace infrt {
namespace host_context {
struct KernelRegistry;
} // namespace host_context
} // namespace infrt
namespace infrt {
namespace kernel {
/**
* Register all the trt kernels to registry.
*/
void RegisterTrtKernels(host_context::KernelRegistry* registry);
} // namespace kernel
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
#include <string>
#include "NvInfer.h"
#include "NvInferRuntime.h"
#include "NvInferRuntimeCommon.h"
#include "glog/logging.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
#include "paddle/infrt/backends/tensorrt/trt_engine.h"
#include "paddle/infrt/backends/tensorrt/trt_options.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
#include "paddle/infrt/host_context/symbol_table.h"
#include "paddle/phi/core/dense_tensor.h"
namespace infrt {
namespace kernel {
namespace tensorrt {
// Translates a single-region `trt.create_engine` MLIR op into a built
// TensorRT engine ready for inference. Steps:
//   1. declare one network input per op operand, binding it to the
//      phi::DenseTensor looked up in the symbol table by positional index
//      ("0", "1", ...);
//   2. translate supported inner ops (currently only trt.Activation) into
//      TRT layers;
//   3. mark every value consumed by `infrt.return` as a network output and
//      name outputs "output_0", "output_1", ...;
//   4. build the engine and set up the inference bindings.
// Only static shapes and float inputs are handled for now.
::infrt::backends::tensorrt::TrtEngine CreateTrtEngine(
    MlirOperationWithInfrtSymbol
        create_engine_op /*, input_tensors, output_tensors, weights*/) {
  // TODO(wilber): The device_id needs to get from mlir.
  int device_id = 0;
  backends::tensorrt::TrtEngine engine(device_id);
  auto* builder = engine.GetTrtBuilder();
  // TODO(wilber): How to process weights?
  backends::tensorrt::TrtUniquePtr<nvinfer1::INetworkDefinition> network;
  // TODO(wilber): static_shape or dynamic_shape network? The code is just
  // static_shape test.
  network.reset(builder->createNetworkV2(0));
  // TODO(wilber): The build option shoule be fiiled from mlir info.
  backends::tensorrt::BuildOptions options;
  options.max_batch = 4;
  // Parse mlir Region which only has one block.
  mlir::Operation& operation = *create_engine_op.operation;
  auto* symbol_table = create_engine_op.symbol_table;
  CHECK_NOTNULL(symbol_table);
  unsigned int num_regions = operation.getNumRegions();
  CHECK_EQ(num_regions, 1U) << "only support one region case.";
  auto& region = operation.getRegion(0);
  auto& block = region.getBlocks().front();
  // Maps each mlir SSA value to the TRT tensor that realizes it.
  llvm::DenseMap<mlir::Value, nvinfer1::ITensor*> map_info;
  // Runtime tensors to be bound as engine inputs, keyed by input name.
  std::unordered_map<std::string, phi::DenseTensor*> trt_bind_inputs;
  for (auto index_operand : llvm::enumerate(operation.getOperands())) {
    mlir::Value operand = index_operand.value();
    size_t idx = index_operand.index();
    const std::string input_name = "input_" + std::to_string(idx);
    auto* v = symbol_table->GetValue(std::to_string(idx));
    CHECK_NOTNULL(v);
    auto* t = &v->get<phi::DenseTensor>();
    trt_bind_inputs[input_name] = t;
    // TODO(wilber): get input info from mlir.
    // TODO(wilber): input dims, now only support static_shape, and just remove
    // the first dimension.
    // TODO(wilber): now only suppot float input.
    // Drop the leading (batch) dimension from the tensor's shape.
    nvinfer1::Dims dims;
    dims.nbDims = t->dims().size() - 1;
    for (int i = 0; i < dims.nbDims; ++i) {
      dims.d[i] = t->dims()[i + 1];
    }
    auto* in =
        network->addInput(input_name.c_str(), nvinfer1::DataType::kFLOAT, dims);
    map_info[operand] = in;
  }
  // TODO(wilber): Find a way to add layer.
  // Translate each supported inner op into a TRT layer; unrecognized ops are
  // silently skipped for now.
  for (auto& inner_op : block.without_terminator()) {
    if (inner_op.getName().getStringRef() == "trt.Activation") {
      trt::ActivationOp act_op = llvm::dyn_cast<trt::ActivationOp>(inner_op);
      auto in_arg = act_op.getOperand();
      if (!map_info.count(in_arg)) {
        CHECK(false) << "map_info not has in_arg.";
      }
      nvinfer1::ActivationType act_type =
          static_cast<nvinfer1::ActivationType>(act_op.activation_type());
      auto* act_layer = network->addActivation(*map_info[in_arg], act_type);
      act_layer->setAlpha(act_op.alpha().convertToFloat());
      act_layer->setBeta(act_op.beta().convertToFloat());
      for (size_t i = 0; i < act_op->getNumResults(); ++i) {
        nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i);
        mlir::Value act_out = act_op->getResult(i);
        map_info[act_out] = act_out_tensor;
      }
    }
    // if (inner_op.getName().getStringRef() == "trt.Constant") {
    //   trt::ConstantOp op = llvm::dyn_cast<trt::ConstantOp>(inner_op);
    //   mlir::Value op_out = op.getResult();
    //   std::vector<float> weight_data{1};
    //   auto* layer = network->addConstant(nvinfer1::Dims2(1, 1),
    //   nvinfer1::Weights{nvinfer1::DataType::kFLOAT, weight_data.data(), 1});
    //   auto* op_out_tenor = layer->getOutput(0);
    //   map_info[op_out] = op_out_tenor;
    // }
  }
  // Any value that feeds `infrt.return` becomes a network output.
  for (auto& inner_op : block.without_terminator()) {
    for (mlir::Value v : inner_op.getResults()) {
      for (mlir::Operation* user : v.getUsers()) {
        if (user->getName().getStringRef() == "infrt.return") {
          if (!map_info.count(v)) {
            CHECK(false) << "map_info not has value";
          }
          network->markOutput(*map_info[v]);
        }
      }
    }
  }
  // std::unordered_map<std::string, phi::DenseTensor*> trt_bind_outputs;
  // Name the outputs "output_0", "output_1", ... following the terminator's
  // operand order, so TrtEngineCompute can retrieve them by name.
  mlir::Operation* ret = block.getTerminator();
  for (unsigned int i = 0; i < ret->getNumOperands(); ++i) {
    mlir::Value arg = ret->getOperand(i);
    CHECK(map_info.count(arg));
    map_info[arg]->setName(("output_" + std::to_string(i)).c_str());
  }
  // Create one output handle per network output (see
  // TrtEngine::PrepareOutputHandle).
  for (int i = 0; i < network->getNbOutputs(); ++i) {
    engine.PrepareOutputHandle(network->getOutput(i)->getName());
  }
  VLOG(3) << "trt engine build start.";
  engine.Build(std::move(network), options);
  VLOG(3) << "trt engine build done.";
  // TODO(wilber): get inference options from mlir.
  backends::tensorrt::InferenceOptions inference_options;
  inference_options.batch = 1;
  // TODO(wilber): bind trt input/output tensors.
  engine.SetUpInference(inference_options, trt_bind_inputs);
  return engine;
}
// Kernel body for `trt.inspect_engine`: dumps the engine's information by
// delegating to TrtEngine::GetEngineInfo (presumably logs layer details —
// confirm in trt_engine.h).
void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) {
  engine->GetEngineInfo();
}
// Kernel body for `trt.compute`: runs the engine on the given GPU context
// and collects its output tensors ("output_0", "output_1", ...) in order.
// The returned pointers are owned by the engine, not the caller.
std::vector<phi::DenseTensor*> TrtEngineCompute(
    backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) {
  engine->Run(context);
  const size_t num_outputs = engine->GetOutputNum();
  std::vector<phi::DenseTensor*> outputs;
  outputs.reserve(num_outputs);
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    outputs.push_back(engine->GetOutput("output_" + std::to_string(idx)));
  }
  return outputs;
}
} // namespace tensorrt
} // namespace kernel
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <tuple>
#include <utility>
#include "mlir/IR/Operation.h"
#include "paddle/infrt/backends/tensorrt/trt_engine.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace infrt {
namespace host_context {
class SymbolTable;  // Forward declaration to avoid including symbol_table.h.
}  // namespace host_context

namespace kernel {
namespace tensorrt {

// Couples the mlir `trt.create_engine` operation with the symbol table used
// to resolve the op's runtime input tensors.
struct MlirOperationWithInfrtSymbol {
  mlir::Operation* operation;
  ::infrt::host_context::SymbolTable* symbol_table;
};

// Builds a TensorRT engine from the given `trt.create_engine` op.
::infrt::backends::tensorrt::TrtEngine CreateTrtEngine(
    MlirOperationWithInfrtSymbol engine_op);

// Dumps engine information (delegates to TrtEngine::GetEngineInfo).
void PrintTrtLayer(backends::tensorrt::TrtEngine* engine);

// Runs the engine on `context` and returns its output tensors in order;
// the returned pointers are owned by the engine.
std::vector<phi::DenseTensor*> TrtEngineCompute(
    backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context);

}  // namespace tensorrt
}  // namespace kernel
}  // namespace infrt
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册