Commit 4be77e53 authored by: phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_temporal_shift_to_phi

......@@ -52,12 +52,12 @@ tools/__pycache__
# This file is automatically generated.
# TODO(zhiqiang) Move this file to build directory.
paddle/infrt/dialect/pd_ops.td
paddle/infrt/dialect/pd/ir/pd_ops.td
paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
tools/infrt/kernels.json
tools/infrt/kernel_signature.json
paddle/infrt/dialect/pd_ops_info.h
paddle/infrt/dialect/pd/common/pd_ops_info.h
.lit_test_times.txt
paddle/infrt/tests/dialect/Output
paddle/infrt/tests/lit.cfg.py
......
......@@ -24,7 +24,7 @@
#include "paddle/fluid/platform/errors.h"
#include "glog/logging.h"
DECLARE_bool(retain_grad_for_all_tensor);
namespace egr {
static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
......@@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
}
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
PADDLE_ENFORCE(grads.size() == 1,
paddle::platform::errors::Fatal(
......@@ -62,7 +62,7 @@ operator()(
grad_out = grads[0][0];
}
if (!weak_grad_.expired()) {
if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) {
auto grad = weak_grad_.lock();
CopyOrAddTensor(grad.get(), grad_out);
}
......
......@@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override;
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
std::string name() { return "GradNodeAccumulation"; }
......
......@@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X(
void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; }
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
// 1. Check Output Size
PADDLE_ENFORCE(
((grads.size() == 1) && (grads[0].size() == 1)),
......
......@@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override;
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
void SetTensorWrappers_X(
const std::vector<paddle::experimental::Tensor>& tensors);
......
......@@ -2074,7 +2074,8 @@ static std::string GenerateGradNodeCCContents(
const char* GRAD_FUNCTION_TEMPLATE =
"std::vector<std::vector<paddle::experimental::Tensor>> "
"GradNode%s::operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads) {\n%s\n}";
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, "
"bool create_graph) {\n%s\n}";
std::string grad_function_str = paddle::string::Sprintf(
GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body);
......@@ -2109,18 +2110,28 @@ static std::string GenerateGradNodeHeaderContents(
"\n"
" virtual std::vector<std::vector<paddle::experimental::Tensor>> "
"operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, const "
"bool create_graph = false) "
"override;\n"
"\n"
" void ClearTensorWrappers() override { \n"
"%s\n"
" is_tensor_wrappers_cleared = true;\n"
" }\n"
" std::string name() override { return \" GradNode%s \"; } \n "
"\n"
" // SetX, SetY, ...\n"
"%s\n"
" // SetAttrMap\n"
"%s\n"
" bool IsTensorWrappersCleared() override { \n"
" return is_tensor_wrappers_cleared;\n"
" }\n"
" private:\n"
" // TensorWrappers\n"
"%s\n"
" bool is_tensor_wrappers_cleared = false;\n"
"\n"
" // Attribute Map\n"
"%s\n"
"};";
......@@ -2154,6 +2165,7 @@ static std::string GenerateGradNodeHeaderContents(
std::string set_tensor_wrappers_str = "";
std::string tensor_wrapper_members_str = "";
std::string clear_tensor_wrappers_str = "";
for (const auto& iter : op_base_infos) {
const std::map<std::string, std::string>& grad_ins_fwd_slotname_map =
iter.GetGradInsFwdSlotnameMap();
......@@ -2185,6 +2197,13 @@ static std::string GenerateGradNodeHeaderContents(
SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name,
struct_tensor_wrapper_name);
const char* CLEAR_TENSOR_WRAPPER_TEMPLATE =
"for (auto tw: %s) {\n"
" tw.clear();\n"
" }\n";
clear_tensor_wrappers_str += paddle::string::Sprintf(
CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
} else {
const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE =
"const paddle::experimental::Tensor& %s";
......@@ -2197,10 +2216,14 @@ static std::string GenerateGradNodeHeaderContents(
TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name);
const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE =
"%s = egr::TensorWrapper(%s, %s /*full_reserved*/);";
"%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n";
tensor_wrapper_body_str = paddle::string::Sprintf(
SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name,
tensor_wrapper_name, full_reserved_str);
const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n";
clear_tensor_wrappers_str += paddle::string::Sprintf(
CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
}
std::string full_reserved_signature_str = "bool full_reserved";
const char* SET_TENSOR_WRAPPER_TEMPLATE =
......@@ -2215,8 +2238,8 @@ static std::string GenerateGradNodeHeaderContents(
std::string grad_node_str = paddle::string::Sprintf(
GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
op_type, op_type, set_tensor_wrappers_str, set_attr_map_str,
tensor_wrapper_members_str, attr_members_str);
op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str,
set_attr_map_str, tensor_wrapper_members_str, attr_members_str);
return grad_node_str;
}
......
......@@ -478,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
# SetTensorWrapper Methods & TensorWrapper Members
set_tensor_wrapper_methods_str = ""
tensor_wrapper_members_str = ""
clear_tensor_wrapper_str = ""
for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items():
if tname in no_need_buffer_set:
no_need_buffer = "true"
......@@ -499,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
"""
tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
{}.clear();
"""
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
tensor_wrapper_name)
else:
assert IsVectorTensorType(ttype)
SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """
......@@ -516,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
"""
tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
   for (auto tw: {}) {{
     tw.clear();
   }};
"""
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
tensor_wrapper_name)
# End: SetTensorWrapper Methods & TensorWrapper Members
# SetAttributes & Attribute Members
......@@ -555,25 +572,37 @@ class {} : public egr::GradNodeBase {{
~{}() override = default;
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
std::string name() override {{ return \" {} \"; }}
void ClearTensorWrappers() override {{
{}
is_tensor_wrappers_cleared = true;
}}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
// SetAttributes
{}
bool IsTensorWrappersCleared() override {{
return is_tensor_wrappers_cleared;
}}
private:
// TensorWrappers
{}
bool is_tensor_wrappers_cleared = false;
// Attributes
{}
}};
"""
node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
grad_node_name, grad_node_name, grad_node_name, grad_node_name,
grad_node_name, set_tensor_wrapper_methods_str,
set_attribute_methods_str, tensor_wrapper_members_str,
attribute_members_str)
grad_node_name, clear_tensor_wrapper_str,
set_tensor_wrapper_methods_str, set_attribute_methods_str,
tensor_wrapper_members_str, attribute_members_str)
return node_declaration_str
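    # For orientation: with a hypothetical forward op ("tanh" here, name and
    # details illustrative only), the declaration template above expands to
    # roughly the following C++ class:
    #
    #   class GradNodeTanh : public egr::GradNodeBase {
    #    public:
    #     virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
    #         const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
    #         bool create_graph = false) override;
    #     void ClearTensorWrappers() override {
    #       /* clear each tensor wrapper */
    #       is_tensor_wrappers_cleared = true;
    #     }
    #     bool IsTensorWrappersCleared() override { return is_tensor_wrappers_cleared; }
    #    private:
    #     bool is_tensor_wrappers_cleared = false;
    #   };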
......@@ -637,7 +666,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
grad_api_namespace = f"paddle::experimental"
FUNCTION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {{
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
// Call grad_api function
auto grad_api_returns = {}::{}({});
{}
......
......@@ -39,12 +39,21 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
// Copy nodes
std::queue<GradNodeBase*> queue = init_queue;
std::unordered_set<GradNodeBase*> visited;
size_t potential_startup_ops_cnt = queue.size();
size_t cnt = 0;
// Visit each node exactly once in any order
while (!queue.empty()) {
GradNodeBase* node = queue.front();
queue.pop();
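    // The first potential_startup_ops_cnt pops are the traversal's entry nodes;
    // give them an explicit in-degree of 0 so they appear in the map even when
    // no edge points to them.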
if (cnt < potential_startup_ops_cnt) {
if (!node_in_degree_map.count(node)) {
node_in_degree_map[node] = 0;
}
cnt += 1;
}
if (visited.count(node)) {
continue;
}
......@@ -76,23 +85,248 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
return node_in_degree_map;
}
void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph) {
paddle::platform::RecordEvent backward_record_event(
"backward", paddle::platform::TracerEventType::Operator, 1);
// Remove the nodes that should not stay in
// potential_stop_nodes / potential_startup_nodes
void UpdateGraphInfo(
std::unordered_map<GradNodeBase*, AutogradMeta*>*
target_nodes_inputmeta_map,
std::unordered_map<GradNodeBase*, std::unordered_set<GradNodeBase*>>*
depending_nodes,
std::unordered_set<GradNodeBase*>* potential_stop_nodes,
std::unordered_set<GradNodeBase*>* potential_startup_nodes) {
  // Update potential_stop_nodes using depending_nodes,
  // making sure the path from root to target_node stays valid
std::unordered_set<GradNodeBase*> _startup_ops;
VLOG(6) << "Running in UpdateGraphInfo";
std::queue<GradNodeBase*> queue;
for (auto& target_nodes_inputmeta_pair : *target_nodes_inputmeta_map) {
queue.emplace(target_nodes_inputmeta_pair.first);
}
while (!queue.empty()) {
auto* target_node = queue.front();
queue.pop();
if (!(*depending_nodes)[target_node].empty()) {
auto precedding_nodes = (*depending_nodes)[target_node];
for (auto pre_nodes : precedding_nodes) {
queue.emplace(pre_nodes);
if (potential_stop_nodes->find(pre_nodes) !=
potential_stop_nodes->end()) {
potential_stop_nodes->erase(pre_nodes);
}
}
    } else {  // startup_ops have no preceding nodes
VLOG(6) << "Emplace _startup_ops";
_startup_ops.emplace(target_node);
}
}
  // Purify potential_startup_nodes again: remove potential
  // startup nodes that cannot reach the input target nodes
if (!_startup_ops.empty()) {
std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
for (auto node : *potential_startup_nodes) {
if (_startup_ops.count(node) == 0) {
VLOG(6) << "Set up potential_startup_nodes_to_be_erased";
potential_startup_nodes_to_be_erased.emplace(node);
}
}
if (!potential_startup_nodes_to_be_erased.empty()) {
for (auto node : potential_startup_nodes_to_be_erased) {
VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased";
potential_startup_nodes->erase(node);
}
}
}
}
// Get graph info between the input target grad nodes and the outputs,
// recording depending_nodes, potential_stop_nodes and potential_startup_nodes
void GetGraphInfoBetweenTargets(
const std::queue<GradNodeBase*>& init_queue,
std::unordered_map<GradNodeBase*, AutogradMeta*>*
input_target_nodes_inputmeta_map,
std::unordered_map</*child node*/ GradNodeBase*,
/*father nodes*/ std::unordered_set<GradNodeBase*>>*
depending_nodes,
std::unordered_set<GradNodeBase*>* potential_stop_nodes,
std::unordered_set<GradNodeBase*>* potential_startup_nodes) {
if (input_target_nodes_inputmeta_map->empty()) return;
VLOG(6) << "Runing In GetGraphInfoBetweenTargets";
// Calculate in_degree for each node
std::unordered_map<GradNodeBase*, int> node_in_degree_map;
// Copy nodes
std::queue<GradNodeBase*> queue = init_queue;
std::unordered_set<GradNodeBase*> visited;
// Visit each node exactly once in any order
while (!queue.empty()) {
GradNodeBase* node = queue.front();
queue.pop();
if (visited.count(node)) {
continue;
}
visited.insert(node);
    // Check whether this node is one of the input target nodes; if it is,
    // all of its next nodes will be marked as potential_stop_nodes
bool is_potential_stop_nodes =
input_target_nodes_inputmeta_map->count(node);
// Find and append next nodes
const std::vector<std::vector<Edge>>& edges = node->GetEdges();
for (const auto& edge_list : edges) {
for (const Edge& edge : edge_list) {
GradNodeBase* next_node = edge.GetMutableGradNode().get();
        // Next node could be nullptr if it is a leaf tensor with no
        // AccumulationNode attached,
        // or it could originate from dispensable inputs
if (!next_node) continue;
        // If the current node is one of the input target nodes,
        // all of its next nodes are inserted into
        // potential_stop_nodes
if (is_potential_stop_nodes) {
potential_stop_nodes->emplace(next_node);
}
// Update in_degree
if (!node_in_degree_map.count(next_node))
node_in_degree_map[next_node] = 0;
node_in_degree_map[next_node]++;
// Record depending relationship
(*depending_nodes)[next_node].emplace(node);
queue.push(next_node);
}
}
}
  // Update graph info and remove unneeded stop nodes from potential_stop_nodes
UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes,
potential_stop_nodes, potential_startup_nodes);
}
void GetTargetNodesInfo(const std::vector<paddle::experimental::Tensor>& inputs,
std::unordered_map<GradNodeBase*, AutogradMeta*>*
target_nodes_inputmeta_map) {
VLOG(6) << "Running in GetTargetNodesInfo";
if (!inputs.empty()) {
VLOG(6) << "Inputs are not empty";
size_t num_inputs = inputs.size();
for (size_t i = 0; i < num_inputs; i++) {
AutogradMeta* auto_grad_meta =
EagerUtils::unsafe_autograd_meta(inputs[i]);
auto target_node = auto_grad_meta->GetMutableGradNode().get();
PADDLE_ENFORCE_NOT_NULL(target_node,
paddle::platform::errors::Fatal(
"There is no grad op for input:%d or it's"
"stop_gradient=True",
i));
(*target_nodes_inputmeta_map)[target_node] = auto_grad_meta;
}
}
}
std::vector<paddle::experimental::Tensor> GetResults(
const std::vector<paddle::experimental::Tensor>& inputs,
std::unordered_map<GradNodeBase*, paddle::experimental::Tensor>*
results_map,
bool allow_unused, bool create_graph) {
VLOG(6) << "Running in GetResults";
if (inputs.empty()) return {};
std::vector<paddle::experimental::Tensor> results;
results.reserve(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
auto& input = inputs[i];
AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input);
auto target_node = auto_grad_meta->GetMutableGradNode().get();
auto iter = results_map->find(target_node);
if (iter != results_map->end()) {
// set StopGradient = !create_graph
AutogradMeta* tensor_auto_grad_meta =
EagerUtils::autograd_meta(&(iter->second));
tensor_auto_grad_meta->SetStopGradient(!create_graph);
results.emplace_back(iter->second);
} else {
PADDLE_ENFORCE_EQ(allow_unused, true,
paddle::platform::errors::InvalidArgument(
"The %d-th input does not appear in the backward "
"graph. Please check the input variable or set "
"allow_unused=True to get None result.",
i));
results.emplace_back();
}
}
return results;
}
// Enforce GradNode has TensorWrappers as Input
void EnforceGradNodeHasInput(GradNodeBase* node) {
VLOG(6) << "Running in EnforceGradNodeHasInput";
PADDLE_ENFORCE_NE(
node->IsTensorWrappersCleared(), true,
paddle::platform::errors::Fatal(
"The TensorWrappers of %s do not exist. This may be because:\n"
"You calculate backward twice for the same subgraph without "
"setting retain_graph=True. Please set retain_graph=True in the "
"first backward/grad call.\n",
node->name()));
}
// Purify potential_startup_nodes: remove nodes that are also
// input_target_nodes
void PurifyPotentialStartUpNodes(
std::unordered_set<GradNodeBase*>* potential_startup_nodes,
std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>*
input_target_nodes_inputmeta_map) {
VLOG(6) << "Running in PurifyPotentialStartUpNodes";
if (input_target_nodes_inputmeta_map->empty()) return;
std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
for (auto startup_op : *potential_startup_nodes) {
auto iter = input_target_nodes_inputmeta_map->find(startup_op);
if (iter != input_target_nodes_inputmeta_map->end()) {
potential_startup_nodes_to_be_erased.emplace(iter->first);
}
}
if (!potential_startup_nodes_to_be_erased.empty()) {
for (auto nodes : potential_startup_nodes_to_be_erased) {
potential_startup_nodes->erase(nodes);
}
}
}
std::vector<paddle::experimental::Tensor> RunBackward(
const std::vector<paddle::experimental::Tensor>& tensors, // output
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph, bool create_graph = false,
const std::vector<paddle::experimental::Tensor>& inputs = {},
bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
VLOG(6) << "Start Backward";
  // *Gradient hooks should happen at node level
  // *Inplace version checks should be performed at node level
  // *Cross-batch accumulation happens in the forward pass
std::unordered_map<GradNodeBase*, AutogradMeta*>
no_grad_var_nodes_inputmeta_map;
// Get no_grad_vars's GradNodes and InputMeta Info
GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map);
/* --- Initialization --- */
// 1. Init queue with starting nodes
// 2. Prepare initial input buffers
std::queue<GradNodeBase*> queue;
std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
node_input_buffers_dict;
std::unordered_set<GradNodeBase*> potential_startup_nodes;
for (size_t i = 0; i < tensors.size(); i++) {
const paddle::experimental::Tensor& tensor = tensors[i];
......@@ -132,8 +366,17 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
"size = 0 or same size as tensors"));
// Feed given tensor if it's provided
VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor";
if (grad_tensors[i].is_initialized()) {
// Deep copy
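        // (presumably so that later accumulation into the input buffer cannot
        //  alias or modify the caller-provided grad tensor)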
paddle::experimental::Tensor tmp_tensor;
tmp_tensor.copy_(grad_tensors[i], true);
node_input_buffers_dict[grad_node]->add(input_info.first,
input_info.second, tmp_tensor);
} else {
node_input_buffers_dict[grad_node]->add(
input_info.first, input_info.second, grad_tensors[i]);
}
} else {
VLOG(6) << "Fill grad input tensor " << i << " with 1.0";
......@@ -146,8 +389,9 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
input_info.first, input_info.second, tensor, true /*fill_one=true*/);
}
// Prepare queue
// Prepare queue, potential startup_nodes
queue.push(grad_node);
potential_startup_nodes.emplace(grad_node);
}
VLOG(6) << "Update In degree Map for backward";
......@@ -155,25 +399,74 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
std::unordered_map<GradNodeBase*, int> node_in_degree_map =
getInDegreeMap(queue);
// Get input's GradNodes and InputMeta Info
std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>
input_target_nodes_inputmeta_map;
GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map);
  // Purify potential_startup_nodes: remove nodes that are also
  // input_target_nodes
PurifyPotentialStartUpNodes(&potential_startup_nodes,
&input_target_nodes_inputmeta_map);
  // Get graph info between the input target grad nodes and the outputs;
  // record the depending_nodes and potential_stop_nodes
std::unordered_map<GradNodeBase* /* child node */,
std::unordered_set<GradNodeBase*> /* father node */>
depending_nodes;
std::unordered_set<GradNodeBase*> potential_stop_nodes;
// std::unordered_set<GradNodeBase*> startup_ops;
GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map,
&depending_nodes, &potential_stop_nodes,
&potential_startup_nodes);
  // ready_queue stores all startup nodes
std::queue<GradNodeBase*> ready_queue;
// startup op's indegree should be 0
for (auto node : potential_startup_nodes) {
if (node_in_degree_map[node] == 0) {
ready_queue.emplace(node);
}
}
VLOG(1) << " startup_ops' size is :" << ready_queue.size();
std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map;
  // ready_queue is empty only when (1) an input equals an output, or
  // (2) an input cannot reach any output.
if (ready_queue.size() == 0) {
for (auto input_target_node : input_target_nodes_inputmeta_map) {
// out rank_info of forward op
auto rank_info = input_target_node.second->OutRankInfo();
if (node_input_buffers_dict[input_target_node.first]) {
auto& target_result =
node_input_buffers_dict[input_target_node.first]
->Buffers()[rank_info.first][rank_info.second];
// save the target result
results_map[input_target_node.first] = target_result;
}
}
}
/* --- Topological Visit --- */
// 1. Pop queue
// 2. Run node
// |- Check and capture target result
// |- node(grads)
// |- Prepare for next node
// 3. Update queue
VLOG(6) << "Run Backward";
while (!queue.empty()) {
GradNodeBase* node = queue.front();
while (!ready_queue.empty()) {
GradNodeBase* node = ready_queue.front();
VLOG(6) << "Running GradNode:" << node->name();
ready_queue.pop();
paddle::platform::RecordEvent node_record_event(
std::string(typeid(*node).name()) + " grad_node",
paddle::platform::TracerEventType::Operator, 1);
if (queue.size() > 1 && node_in_degree_map[node] != 0) {
queue.pop();
continue;
}
queue.pop();
// Run node: This is where Hook happens
PADDLE_ENFORCE(
node_input_buffers_dict.count(node),
......@@ -184,10 +477,45 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
std::unique_ptr<GradTensorHolder> node_input_buffer =
std::move(node_input_buffers_dict[node]);
// get target grad_var from node_input_buffer by inputmeta
if (input_target_nodes_inputmeta_map.find(node) !=
input_target_nodes_inputmeta_map.end()) {
VLOG(6) << "Get target result by by inputmeta";
// out rank_info of forward op
auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo();
// rank_info is a pair, first means slot_id, second means rank.
auto& target_result =
node_input_buffer->Buffers()[rank_info.first][rank_info.second];
// save the target result
results_map[node] = target_result;
}
// no_grad_vars
if (no_grad_var_nodes_inputmeta_map.find(node) !=
no_grad_var_nodes_inputmeta_map.end()) {
VLOG(6) << "Change the input buffer[slot][rank] by Zeros";
auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo();
node_input_buffer->SetBufferSlotRankZeros(rank_info.first,
rank_info.second);
}
VLOG(6) << "Running GradNode:" << node->name();
// check input
EnforceGradNodeHasInput(node);
VLOG(6) << "Run Backward Kernel with GradTensorHolder";
// Run Pre Backward Node and get outputs
std::vector<std::vector<paddle::experimental::Tensor>> grad_output_tensors =
(*node)(node_input_buffer->Buffers());
(*node)(node_input_buffer->Buffers(), create_graph);
    // retain_graph or not
if (!retain_graph) {
VLOG(6)
<< "retain_graph is false, need to clear the TensorWrapper of nodes.";
node->ClearTensorWrappers();
}
// TODO(jiabin): Should we erase it or find a more efficient way.
node_input_buffers_dict.erase(node);
......@@ -252,18 +580,44 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
// Update queue
node_in_degree_map[next_node]--;
PADDLE_ENFORCE(
node_in_degree_map[next_node] >= 0,
paddle::platform::errors::Fatal(
"Detected in-degree value smaller than zero. For Node: %s"
"Node's in-degree cannot be negative",
next_node->name()));
if (node_in_degree_map[next_node] == 0) {
queue.emplace(std::move(next_node));
bool is_potential_stop_node = potential_stop_nodes.count(next_node);
if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) {
ready_queue.emplace(std::move(next_node));
}
}
}
}
return GetResults(inputs, &results_map, allow_unused, create_graph);
}
void Backward(
const std::vector<paddle::experimental::Tensor>& tensors, // output
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph) {
VLOG(6) << "Run in Backward";
paddle::platform::RecordEvent backward_record_event(
"backward", paddle::platform::TracerEventType::Operator, 1);
RunBackward(tensors, grad_tensors, retain_graph);
}
std::vector<paddle::experimental::Tensor> Grad(
const std::vector<paddle::experimental::Tensor>& tensors, // output
const std::vector<paddle::experimental::Tensor>& inputs,
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused,
const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
VLOG(6) << "Run in Grad";
return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs,
allow_unused, no_grad_vars);
}
} // namespace egr
......@@ -19,13 +19,21 @@
namespace egr {
// run_backward():
// Backward():
// tensors correspond to those that live in the backward graph
// each grad_tensors[i] keeps the value for its corresponding tensors[i]
void RunBackward(const std::vector<paddle::experimental::Tensor> &tensors,
const std::vector<paddle::experimental::Tensor> &grad_tensors,
void Backward(const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph = false);
std::vector<paddle::experimental::Tensor> Grad(
const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& inputs,
const std::vector<paddle::experimental::Tensor>& grad_tensors = {},
bool retain_graph = false, bool create_graph = false,
bool only_inputs = false, bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {});
// Reserved for gradient()
} // namespace egr
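For reference, a minimal usage sketch of the two entry points declared above (mirroring backward_test.cc and grad_test.cc later in this diff; the tensor names are placeholders):

  std::vector<paddle::experimental::Tensor> outs = {output_tensor};
  // Backward: accumulate gradients into the leaf tensors' grads, returns nothing.
  egr::Backward(outs, /*grad_tensors=*/{});
  // Grad: return the gradients of the requested inputs instead.
  std::vector<paddle::experimental::Tensor> grads =
      egr::Grad(outs, /*inputs=*/{leaf_tensor}, /*grad_tensors=*/{});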
......@@ -20,8 +20,8 @@
namespace egr {
std::vector<std::vector<paddle::experimental::Tensor>> RunCustomOpNode::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
paddle::CustomOpKernelContext ctx;
auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs(
egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
......
......@@ -37,8 +37,8 @@ class RunCustomOpNode : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) override;
std::string name() {
return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_);
......@@ -62,6 +62,12 @@ class RunCustomOpNode : public GradNodeBase {
return res;
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
void SetAttrs(const std::vector<paddle::any>& attr) { attrs_ = attr; }
public:
......
......@@ -95,8 +95,12 @@ class GradNodeBase {
* is better choice to fit this format.
* **/
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) = 0;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) = 0;
virtual void ClearTensorWrappers() = 0;
virtual bool IsTensorWrappersCleared() = 0;
/**
* AddEdges is designed to set input tensors' backward Node as current
* node's Edges.
......
......@@ -21,6 +21,11 @@
namespace egr {
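// Zeroes out one (slot, rank) entry of the buffer; RunBackward uses this to
// suppress gradients flowing into variables listed in no_grad_vars.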
void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) {
buffer_[slot_id][rank] =
paddle::experimental::zeros_like(buffer_[slot_id][rank]);
}
void GradTensorHolder::add(size_t slot_id, size_t rank,
const paddle::experimental::Tensor& t,
bool fill_one) {
......
......@@ -56,6 +56,8 @@ class GradTensorHolder {
return buffer_;
}
void SetBufferSlotRankZeros(size_t slot_id, size_t rank);
private:
std::vector<std::vector<paddle::experimental::Tensor>> buffer_;
};
......
......@@ -98,6 +98,8 @@ class TensorWrapper {
}
}
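  // clear() drops the tensor held by this wrapper; RunBackward calls each
  // node's ClearTensorWrappers() when retain_graph is false, after which a
  // second backward over the same nodes is rejected by EnforceGradNodeHasInput.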
void clear() { intermidiate_tensor_.reset(); }
private:
bool full_reserved_ = false;
std::pair<size_t, size_t> out_rank_info_;
......
......@@ -17,6 +17,14 @@
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT);
#endif
namespace eager_test {
using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta;
......@@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) {
CHECK_EQ(dt3_tmp_ptr[1], 10.0f);
t4.reset();
CHECK(t4.defined() == false);
VLOG(6) << "Check Tensor Copy_";
std::vector<int64_t> rows = {1, 2};
std::vector<int64_t> dims = {2};
paddle::experimental::Tensor t7(std::make_shared<phi::SelectedRows>(rows, 2));
std::dynamic_pointer_cast<phi::SelectedRows>(t7.impl())
->mutable_value()
->Resize(phi::make_ddim(dims));
auto* dt7_tmp_ptr = std::dynamic_pointer_cast<phi::SelectedRows>(t7.impl())
->mutable_value()
->mutable_data<float>(paddle::platform::CPUPlace());
dt7_tmp_ptr[0] = 6.0f;
dt7_tmp_ptr[1] = 11.0f;
paddle::experimental::Tensor t8;
paddle::experimental::Tensor t5;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::experimental::Tensor t6;
paddle::experimental::Tensor t9;
VLOG(6) << "Check Tensor Copy_ Selected Rows";
t8.copy_(t7, paddle::platform::CUDAPlace(0), true);
t9.copy_(t8, paddle::platform::CPUPlace(), true);
auto* dt9_tmp_ptr = std::dynamic_pointer_cast<phi::SelectedRows>(t9.impl())
->value()
.data<float>();
CHECK_EQ(dt9_tmp_ptr[0], 6.0f);
CHECK_EQ(dt9_tmp_ptr[1], 11.0f);
CHECK_EQ(std::dynamic_pointer_cast<phi::SelectedRows>(t9.impl())->height(),
2);
VLOG(6) << "Check Tensor Copy_ Dense Tensor";
t5.copy_(t3, paddle::platform::CUDAPlace(0), true);
t6.copy_(t5, paddle::platform::CPUPlace(), true);
auto* dt6_tmp_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(t6.impl())->data<float>();
CHECK_EQ(dt6_tmp_ptr[0], 5.0f);
CHECK_EQ(dt6_tmp_ptr[1], 10.0f);
#else
t5.copy_(t3, paddle::platform::CPUPlace(), true);
auto* dt5_tmp_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(t5.impl())->data<float>();
CHECK_EQ(dt5_tmp_ptr[0], 5.0f);
CHECK_EQ(dt5_tmp_ptr[1], 10.0f);
#endif
VLOG(6) << "Finish";
}
......@@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase {
GradTestNode() : GradNodeBase() { val_ = 1.0; }
std::string name() override { return "GradTestNode"; }
std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override {
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override {
val_ = std::dynamic_pointer_cast<phi::DenseTensor>(grads[0][0].impl())
->data<float>()[0];
phi::DenseTensorMeta meta =
......@@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase {
std::vector<std::vector<paddle::experimental::Tensor>> res = {{et1}};
return res;
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
float val_;
};
} // namespace eager_test
......@@ -58,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 10)
......@@ -80,7 +80,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 2)
......@@ -106,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 2)
......@@ -137,7 +137,7 @@ void benchmark_eager_intermediate_mlp(
reduce_sum_dygraph_function(input0, {{"reduce_all", true}});
std::vector<paddle::experimental::Tensor> target_tensors = {Out};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
if (accuracy_check) {
std::unordered_map<std::string, float> result =
......
......@@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_
cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node)
......
......@@ -33,6 +33,7 @@
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace egr {
......@@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) {
}
std::vector<paddle::experimental::Tensor> outs = {target_tensor};
// Run Backward
RunBackward(outs, {});
Backward(outs, {});
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 5.0);
......@@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) {
}
// Run Backward
RunBackward(target_tensors, grad_tensors);
Backward(target_tensors, grad_tensors);
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 50.0);
......@@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) {
}
// Use Empty Grad Tensor
RunBackward(target_tensors, {});
Backward(target_tensors, {});
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 50.0);
......@@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) {
node2_ptr->AddEdges(&res2, 0);
}
RunBackward(target_tensors, grad_tensors);
Backward(target_tensors, grad_tensors);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 2500.0);
}
......
......@@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) {
std::vector<egr::AutogradMeta*> res = {meta};
scale_node_ptr->AddEdges(&res, 0);
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 5.0);
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 10.0);
......
......@@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) {
std::vector<paddle::experimental::Tensor> outs = {out};
// 4. Run Backward
RunBackward(outs, {});
Backward(outs, {});
VLOG(7) << "Target Grad is: "
<< std::static_pointer_cast<phi::DenseTensor>(
......@@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) {
std::vector<paddle::experimental::Tensor> outs = {out1};
// 4. Run Backward
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 10.0);
......@@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
......@@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
// leaf grad
......@@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
// Cross Batch Accumulation
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 60.0);
......@@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) {
std::vector<paddle::experimental::Tensor> outs = {out};
// 4. Run Backward
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 2.0);
......@@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) {
// TODO(jiabin): fix this with add functor
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
RunBackward(outs, {});
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
......
......@@ -57,7 +57,7 @@ TEST(Generated, Sigmoid) {
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
VLOG(6) << "Runing Backward";
RunBackward(target_tensors, {});
Backward(target_tensors, {});
VLOG(6) << "Finish Backward";
eager_test::CompareGradTensorWithValue<float>(tensor, 0.25);
......@@ -89,7 +89,7 @@ TEST(Generated, Matmul_v2) {
eager_test::CompareTensorWithValue<float>(output_tensor, 96);
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
eager_test::CompareGradTensorWithValue<float>(Y, 3.0 * 4);
......@@ -120,7 +120,7 @@ TEST(Generated, ElementwiseAdd) {
eager_test::CompareTensorWithValue<float>(output_tensor, 5);
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 1.0);
eager_test::CompareGradTensorWithValue<float>(Y, 1.0);
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace egr {
TEST(Grad, SingleNodeEmptyGrad) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
// Prepare Inputs
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor (output)
paddle::experimental::Tensor output_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
// Create input tensor
const paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
{
// Create Scale Node
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
node0_ptr->SetAttributes_scale(5.0 /*scale*/);
// Set grad in/out meta
node0_ptr->SetDefaultGradInOutMeta();
    // Set the GradNode, OutRank and StopGradient properties on output_tensor
AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor);
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
// Get autograd_meta from input tensor
AutogradMeta* auto_grad_meta1 =
EagerUtils::unsafe_autograd_meta(leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
    // Set the GradNode, OutRank and StopGradient properties on the input tensor
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
// grad_node Add Edges
std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
node0_ptr->AddEdges(&res, 0);
}
std::vector<paddle::experimental::Tensor> outs = {output_tensor};
// Run Grad
auto result = Grad(outs, {leaf_tensor}, {});
// Check Output Value
eager_test::CompareTensorWithValue<float>(result[0], 5.0);
}
TEST(Grad, SingleNodeCustomGrad) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
// Prepare Inputs
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
std::vector<paddle::experimental::Tensor> grad_tensors;
// Create Grad Tensor
paddle::experimental::Tensor grad_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor));
paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
{
// Create Scale Node
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
node0_ptr->SetAttributes_scale(5.0 /*scale*/);
// Set grad in/out meta
node0_ptr->SetDefaultGradInOutMeta();
// Connect Tensor and Node via AutoGradMeta
AutogradMeta* auto_grad_meta =
EagerUtils::autograd_meta(&(target_tensors[0]));
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
node0_ptr->AddEdges(&res, 0);
}
auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors);
// Check Output Value
eager_test::CompareTensorWithValue<float>(result[0], 50.0);
}
/*
Node1
|
Node0
|
{ } // empty grad tensor
*/
TEST(Grad, LinearNodes) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
// Prepare Target Tensor
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
{
// Create Node0
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
node0_ptr->SetAttributes_scale(5.0 /*scale*/);
// Set grad in/out meta for node0
node0_ptr->SetDefaultGradInOutMeta();
// Create Node1
auto node1_ptr = std::make_shared<GradNodeScale>(1, 1);
node1_ptr->SetAttributes_scale(10.0 /*scale*/);
// Set grad in/out meta for node1
node1_ptr->SetDefaultGradInOutMeta();
// Connect Input Tensor and Node0 via AutoGradMeta
AutogradMeta* auto_grad_meta =
EagerUtils::autograd_meta(&(target_tensors[0]));
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
// Connect Node0 -> Node1 via Edge
auto meta0 = egr::AutogradMeta();
meta0.SetStopGradient(false);
meta0.SetSingleOutRankWithSlot(0, 0);
meta0.SetGradNode(node1_ptr);
std::vector<egr::AutogradMeta*> res0 = {&meta0};
node0_ptr->AddEdges(&res0, 0);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res1 = {auto_grad_meta1};
node1_ptr->AddEdges(&res1, 0);
}
// Use Empty Grad Tensor
auto result = Grad(target_tensors, {leaf_tensor}, {});
// Check Output Value
eager_test::CompareTensorWithValue<float>(result[0], 50.0);
}
/*
Node2
| |
Node0 Node1
| |
in0 in1
*/
TEST(Grad, WithAccumulation) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
// Prepare Inputs
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor0));
target_tensors.emplace_back(std::move(tensor1));
// Create Grad Tensor
std::vector<paddle::experimental::Tensor> grad_tensors;
paddle::experimental::Tensor grad_tensor0 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor grad_tensor1 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor0));
grad_tensors.emplace_back(std::move(grad_tensor1));
paddle::experimental::Tensor leaf_tensor;
{
// Create Node0
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
node0_ptr->SetAttributes_scale(5.0 /*scale*/);
node0_ptr->SetDefaultGradInOutMeta();
// Create Node1
auto node1_ptr = std::make_shared<GradNodeScale>(1, 1);
node1_ptr->SetAttributes_scale(10.0 /*scale*/);
node1_ptr->SetDefaultGradInOutMeta();
// Create Node2
auto node2_ptr = std::make_shared<GradNodeScale>(1, 1);
node2_ptr->SetAttributes_scale(20.0 /*scale*/);
node2_ptr->SetDefaultGradInOutMeta();
// Connect Inp0 and Node0 via AutoGradMeta
AutogradMeta* auto_grad_meta0 =
EagerUtils::autograd_meta(&(target_tensors[0]));
auto_grad_meta0->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta0->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta0->SetStopGradient(false);
// Connect Inp1 and Node1 via AutoGradMeta
AutogradMeta* auto_grad_meta1 =
EagerUtils::autograd_meta(&(target_tensors[1]));
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node1_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
// Connect Node0 -> Node2 via Edge
auto meta0 = egr::AutogradMeta();
meta0.SetStopGradient(false);
meta0.SetSingleOutRankWithSlot(0, 0);
meta0.SetGradNode(node2_ptr);
std::vector<egr::AutogradMeta*> res0 = {&meta0};
node0_ptr->AddEdges(&res0, 0);
// Connect Node1 -> Node2 via Edge
auto meta1 = egr::AutogradMeta();
meta1.SetStopGradient(false);
meta1.SetSingleOutRankWithSlot(0, 0);
meta1.SetGradNode(node2_ptr);
std::vector<egr::AutogradMeta*> res1 = {&meta1};
node1_ptr->AddEdges(&res1, 0);
AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta2);
auto_grad_meta2->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta2->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta2->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res2 = {auto_grad_meta2};
node2_ptr->AddEdges(&res2, 0);
}
auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors);
eager_test::CompareTensorWithValue<float>(result[0], 2500.0);
}
} // namespace egr
......@@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) {
leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0
}
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 4.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0);
......@@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) {
leaf_tensor, std::make_shared<egr::CppTensorHook>(hook_function));
}
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0);
}
......
......@@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) {
}
VLOG(6) << "Runing Backward";
RunBackward(target_tensors, {});
Backward(target_tensors, {});
VLOG(6) << "Finish Backward";
eager_test::CompareGradTensorWithValue<float>(
......@@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) {
grad_node_tmp->RemoveGradientHook(hook_id);
}
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 1.0);
eager_test::CompareGradTensorWithValue<float>(
......@@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) {
grad_node_tmp->RemoveGradientHook(hook_id);
}
RunBackward(target_tensors, {});
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
eager_test::CompareGradTensorWithValue<float>(
......
......@@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase {
~GradNodeRunProgram() override = default;
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>> &grads)
override {
const std::vector<std::vector<paddle::experimental::Tensor>> &grads,
bool create_graph) override {
VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
PADDLE_ENFORCE_EQ(
grads.size(), 1,
......@@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase {
// return {x_grad, details::DereferenceTensors(params_grad_ptr)};
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
// SetAttrMap
void SetAttrMap(const paddle::framework::AttributeMap &attrs) {
attrs_ = attrs;
......
......@@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference)
pass_library(add_support_int8_pass inference)
pass_library(matmul_scale_fuse_pass inference)
pass_library(gpu_cpu_map_matmul_to_mul_pass inference)
pass_library(mixed_precision_configure_pass inference)
pass_library(generate_pass DEPS pass_desc_proto)
target_link_libraries(generate_pass pass_desc_proto)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace ir {
void MixedPrecisionConfigurePass::InsertCastOps(
Graph* graph, const StringSet& blacklist) const {
VLOG(3) << "Insert the cast op before and after the kernel that does not "
"supports fp16 precision";
auto update_cast_desc = [&](
framework::OpDesc& desc, const std::string& x_name,
const std::string& out_name, const int in_dtype, const int out_dtype) {
desc.SetType("cast");
desc.SetInput("X", {x_name});
desc.SetOutput("Out", {out_name});
desc.SetAttr("in_dtype", in_dtype);
desc.SetAttr("out_dtype", out_dtype);
desc.SetAttr("use_mkldnn", false);
desc.SetAttr("with_quant_attr", false);
desc.Flush();
};
auto cast_input = [&](Graph* graph, Node* op_node,
const StringSet& cast_list) {
auto inlinks = op_node->inputs;
for (auto* pre_node : inlinks) {
if (pre_node->IsVar()) {
const auto is_persistable = pre_node->Var()->Persistable();
const auto is_float =
pre_node->Var()->GetDataType() == proto::VarType::FP16 ||
pre_node->Var()->GetDataType() == proto::VarType::FP32 ||
pre_node->Var()->GetDataType() == proto::VarType::FP64;
if (!is_persistable && is_float) {
int suffix = 0;
for (auto* pre_node_input : pre_node->inputs) {
if (!pre_node_input->IsOp()) continue;
const auto& type = pre_node_input->Op()->Type();
if (!cast_list.count(type) && type != "cast") {
std::string old_name = pre_node->Name();
std::string new_name =
old_name + "_cast.tmp_" + std::to_string(suffix);
suffix++;
framework::OpDesc new_op_desc(op_node->Op()->Block());
// 4 for fp16, 5 for fp32
update_cast_desc(new_op_desc, old_name, new_name, 4, 5);
auto* new_op = graph->CreateOpNode(&new_op_desc);
VarDesc out_var(new_name);
out_var.SetPersistable(false);
auto* node_var = graph->CreateVarNode(&out_var);
op_node->Op()->RenameInput(old_name, new_name);
IR_NODE_LINK_TO(pre_node, new_op);
IR_NODE_LINK_TO(new_op, node_var);
IR_NODE_LINK_TO(node_var, op_node);
}
}
}
}
}
};
auto cast_output = [&](Graph* graph, Node* op_node,
const StringSet& cast_list) {
auto outlinks = op_node->outputs;
for (auto* next_node : outlinks) {
if (next_node->IsVar()) {
const auto is_persistable = next_node->Var()->Persistable();
const auto is_float =
next_node->Var()->GetDataType() == proto::VarType::FP16 ||
next_node->Var()->GetDataType() == proto::VarType::FP32 ||
next_node->Var()->GetDataType() == proto::VarType::FP64;
if (!is_persistable && is_float) {
int suffix = 0;
for (auto* next_node_output : next_node->outputs) {
if (!next_node_output->IsOp()) continue;
const auto& type = next_node_output->Op()->Type();
if (!cast_list.count(type) && type != "cast") {
std::string old_name = next_node->Name();
std::string new_name =
old_name + "_cast.tmp_" + std::to_string(suffix);
suffix++;
framework::OpDesc new_op_desc(op_node->Op()->Block());
// proto::VarType: 5 = FP32, 4 = FP16; cast the FP32 output back to FP16
update_cast_desc(new_op_desc, old_name, new_name, 5, 4);
auto* new_op = graph->CreateOpNode(&new_op_desc);
VarDesc out_var(new_name);
out_var.SetPersistable(false);
auto* node_var = graph->CreateVarNode(&out_var);
next_node_output->Op()->RenameInput(old_name, new_name);
IR_NODE_LINK_TO(next_node, new_op);
IR_NODE_LINK_TO(new_op, node_var);
IR_NODE_LINK_TO(node_var, next_node_output);
}
}
}
}
}
};
for (auto* op_node :
ir::TopologyVarientSort(*graph, static_cast<ir::SortKind>(0))) {
if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
op_node->Op()->Type() == "fetch")
continue;
const auto& type = op_node->Op()->Type();
if (blacklist.count(type)) {
cast_input(graph, op_node, blacklist);
cast_output(graph, op_node, blacklist);
}
}
}
void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const {
const auto blacklist =
Get<std::unordered_set<std::string>>("gpu_fp16_disabled_op_types");
InsertCastOps(graph, blacklist);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mixed_precision_configure_pass,
paddle::framework::ir::MixedPrecisionConfigurePass);
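An illustrative usage sketch (not part of the diff above): assuming the usual ir::PassRegistry / Pass::Set / Pass::Apply interfaces, the blacklist attribute consumed by ApplyImpl() could be supplied like this; the op names below are placeholders.
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
void RunMixedPrecisionConfigure(paddle::framework::ir::Graph* graph) {
  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
      "mixed_precision_configure_pass");
  // Ops listed here are kept in fp32; the pass wraps them with cast ops.
  pass->Set("gpu_fp16_disabled_op_types",
            new std::unordered_set<std::string>({"softmax", "layer_norm"}));
  pass->Apply(graph);
}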
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
using StringSet = std::unordered_set<std::string>;
class MixedPrecisionConfigurePass : public FusePassBase {
public:
MixedPrecisionConfigurePass() = default;
virtual ~MixedPrecisionConfigurePass() {}
protected:
void ApplyImpl(Graph* graph) const override;
private:
void InsertCastOps(Graph* graph, const StringSet& blacklist) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -188,6 +188,9 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool);
DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes,
std::unordered_set<std::string>);
// Usually use for trt dynamic shape.
// TRT will select the best kernel according to opt shape
......
......@@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument,
new int(argument->dlnne_min_subgraph_size()));
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
} else if (pass_name == "mixed_precision_configure_pass") {
pass->Set("gpu_fp16_disabled_op_types",
new std::unordered_set<std::string>(
argument->gpu_fp16_disabled_op_types()));
}
if (pass_name == "lite_subgraph_pass") {
bool lite_enable_int8 =
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
#else
void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap(
const framework::ir::Graph &graph,
std::unordered_map<std::string, std::string> *var_name_op_type_map) {
std::vector<framework::ir::Node *> node_list =
framework::ir::TopologyVarientSort(
graph, static_cast<framework::ir::SortKind>(0));
for (auto *op_node : node_list) {
if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
op_node->Op()->Type() == "fetch")
continue;
for (auto *pre_node : op_node->inputs) {
if (pre_node->IsVar() && pre_node->Var()->Persistable()) {
var_name_op_type_map->insert(std::pair<std::string, std::string>(
pre_node->Var()->Name(), op_node->Op()->Type()));
}
}
}
}
void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
......@@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
if (with_dynamic_shape) {
reserve_cpu_weights = true;
}
bool mixed_precision_mode =
argument->Has("use_gpu_fp16") && argument->use_gpu_fp16();
std::unordered_map<std::string, std::string> var_name_op_type_map{};
std::unordered_set<std::string> blacklist{};
if (mixed_precision_mode) {
GetVarNameToOpTypeMap(graph, &var_name_op_type_map);
blacklist = argument->gpu_fp16_disabled_op_types();
}
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
......@@ -117,20 +148,31 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
var->IsType<framework::Tensor>()) {
auto *t = var->GetMutable<framework::LoDTensor>();
bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 ||
t->dtype() == paddle::experimental::DataType::FLOAT64;
if (mixed_precision_mode &&
!blacklist.count(var_name_op_type_map[var_name]) && is_float) {
framework::Tensor half_tensor;
half_tensor.set_type(paddle::experimental::DataType::FLOAT16);
half_tensor.Resize(t->dims());
auto *half_data =
half_tensor.mutable_data<float16>(platform::CPUPlace());
        auto *data = t->mutable_data<float>(platform::CPUPlace());
        for (int i = 0; i < t->numel(); i++) {
          half_data[i] = static_cast<float16>(data[i]);
        }
t->clear();
paddle::framework::TensorCopySync(half_tensor, place, t);
} else {
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(t->dims());
temp_tensor.mutable_data<float>(cpu_place);
// Copy the parameter data to a tmp tensor.
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocate the space on GPU.
t->clear();
// Copy parameter data to newly allocated GPU space.
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
}
}
}
#endif
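For clarity, the element-wise fp32 -> fp16 cast that CopyParamsToGpu performs on the CPU before copying to the GPU, extracted as a standalone sketch (the float16 header path is an assumption):
#include <cstdint>
#include <vector>
#include "paddle/fluid/platform/float16.h"
std::vector<paddle::platform::float16> CastWeightsToFp16(const float* src,
                                                         int64_t numel) {
  std::vector<paddle::platform::float16> dst(numel);
  for (int64_t i = 0; i < numel; ++i) {
    // Same conversion as in the loop above: narrow each fp32 weight to fp16.
    dst[i] = static_cast<paddle::platform::float16>(src[i]);
  }
  return dst;
}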
......
......@@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
#ifdef PADDLE_WITH_ASCEND_CL
void CopyParamsToNpu(Argument *argument);
#else
void CopyParamsToGpu(Argument *argument);
void GetVarNameToOpTypeMap(
const framework::ir::Graph& graph,
std::unordered_map<std::string, std::string>* var_name_op_type_map);
void CopyParamsToGpu(Argument* argument);
#endif
};
......
......@@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path,
Update();
}
void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
Update();
}
void AnalysisConfig::DisableGpu() {
use_gpu_ = false;
Update();
}
void AnalysisConfig::Exp_EnableUseGpuFp16(
std::unordered_set<std::string> op_list) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
use_gpu_fp16_ = true;
gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end());
#else
LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()";
use_gpu_fp16_ = false;
#endif
Update();
}
void AnalysisConfig::DisableFCPadding() {
use_fc_padding_ = false;
......@@ -213,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_cudnn_);
CP_MEMBER(gpu_device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(use_gpu_fp16_);
CP_MEMBER(gpu_fp16_disabled_op_types_);
CP_MEMBER(enable_memory_optim_);
// TensorRT related.
......@@ -573,6 +590,20 @@ void AnalysisConfig::Update() {
#endif
}
if (use_gpu_fp16_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!enable_ir_optim_) {
LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is "
"enabled.";
} else if (!use_gpu()) {
LOG(ERROR)
<< "Exp_EnableUseGpuFp16() only works when use_gpu is enabled.";
} else {
pass_builder()->Exp_EnableUseGpuFp16();
}
#endif
}
if (use_mkldnn_) {
#ifdef PADDLE_WITH_MKLDNN
if (!enable_ir_optim_) {
......@@ -669,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << params_file_;
ss << use_gpu_;
ss << use_gpu_fp16_;
for (auto &item : gpu_fp16_disabled_op_types_) ss << item;
ss << use_fc_padding_;
ss << gpu_device_id_;
ss << xpu_device_id_;
......
......@@ -872,6 +872,11 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
}
if (config_.gpu_fp16_enabled()) {
argument_.SetUseGPUFp16(true);
argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_);
}
if (config_.lite_engine_enabled()) {
argument_.SetCpuMathLibraryNumThreads(
config_.cpu_math_library_num_threads());
......
......@@ -375,6 +375,19 @@ TEST(AnalysisPredictor, enable_onnxruntime) {
ASSERT_TRUE(!config.use_onnxruntime());
}
TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) {
AnalysisConfig config;
config.SwitchIrOptim();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
config.EnableUseGpu(100, 0);
config.Exp_EnableUseGpuFp16();
ASSERT_TRUE(config.gpu_fp16_enabled());
#else
config.DisableGpu();
#endif
LOG(INFO) << config.Summary();
}
} // namespace paddle
namespace paddle_infer {
......@@ -434,6 +447,19 @@ TEST(Predictor, EnableONNXRuntime) {
auto predictor = CreatePredictor(config);
}
TEST(Predictor, Exp_EnableUseGpuFp16) {
Config config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
config.EnableUseGpu(100, 0);
config.Exp_EnableUseGpuFp16();
#else
config.DisableGpu();
#endif
auto predictor = CreatePredictor(config);
}
TEST(Tensor, CpuShareExternalData) {
Config config;
config.SetModel(FLAGS_dirname);
......
......@@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig {
///
///
void DisableGpu();
///
/// \brief Enable GPU fp16 precision computation (experimental).
///
/// \param op_list Operator types that are kept in fp32 (excluded from fp16).
///
void Exp_EnableUseGpuFp16(std::unordered_set<std::string> op_list = {});
///
/// \brief A boolean state telling whether the GPU fp16 precision is turned
/// on.
///
/// \return bool Whether the GPU fp16 precision is turned on.
///
bool gpu_fp16_enabled() const { return use_gpu_fp16_; }
///
/// \brief Turn on XPU.
......@@ -859,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig {
int gpu_device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
bool thread_local_stream_{false};
bool use_gpu_fp16_{false};
std::unordered_set<std::string> gpu_fp16_disabled_op_types_{
"conv2d_fusion", "conv2d", "roll", "strided_slice"};
bool use_cudnn_{false};
......
......@@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() {
use_cudnn_ = true;
}
void GpuPassStrategy::Exp_EnableUseGpuFp16() {
passes_.assign({
"is_test_pass", //
"simplify_with_basic_ops_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
"multihead_matmul_fuse_pass_v2", //
"gpu_cpu_squeeze2_matmul_fuse_pass", //
"gpu_cpu_reshape2_matmul_fuse_pass", //
"gpu_cpu_flatten2_matmul_fuse_pass", //
"gpu_cpu_map_matmul_v2_to_mul_pass", //
"gpu_cpu_map_matmul_v2_to_matmul_pass", //
"gpu_cpu_map_matmul_to_mul_pass", //
// "fc_fuse_pass", //
"fc_elementwise_layernorm_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
// cudnn8.0 has memory leak problem in conv + eltwise + act, so we
// disable the pass.
#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100)
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
#endif
"conv_elementwise_add_fuse_pass", //
#endif //
"transpose_flatten_concat_fuse_pass", //
"mixed_precision_configure_pass", //
"runtime_context_cache_pass" //
});
use_gpu_fp16_ = true;
}
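A small sketch of how this strategy is exercised (it mirrors the call made from AnalysisConfig::Update() when Exp_EnableUseGpuFp16() is set):
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
void DemoGpuFp16PassStrategy() {
  paddle::GpuPassStrategy strategy;
  strategy.Exp_EnableUseGpuFp16();
  // The pass list now ends with mixed_precision_configure_pass followed by
  // runtime_context_cache_pass, and use_gpu_fp16() reports true.
  bool fp16_on = strategy.use_gpu_fp16();
  (void)fp16_on;
}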
void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet";
}
......
......@@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \brief Enable the use of cuDNN kernel.
virtual void EnableCUDNN() {}
/// \brief Enable the use of GPU fp16 kernels.
virtual void Exp_EnableUseGpuFp16() {}
/// \brief Enable the use of MKLDNN.
/// The MKLDNN control exists in both CPU and GPU mode, because there can
/// still be some CPU kernels running in GPU mode.
......@@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; }
/// \brief Check if we are using gpu fp16 kernel.
/// \return A bool variable implying whether we are in gpu fp16 mode.
bool use_gpu_fp16() const { return use_gpu_fp16_; }
/// \brief Check if we are using xpu.
/// \return A bool variable implying whether we are in xpu mode.
bool use_xpu() const { return use_xpu_; }
......@@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
bool use_npu_{false};
bool use_ipu_{false};
bool use_mkldnn_{false};
bool use_gpu_fp16_{false};
/// \endcond
};
......@@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \brief Enable the use of cuDNN kernel.
void EnableCUDNN() override;
/// \brief Enable the use of gpu fp16 kernel.
void Exp_EnableUseGpuFp16() override;
/// \brief Not supported in GPU mode yet.
void EnableMKLDNN() override;
......@@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
protected:
/// \cond Protected
bool use_cudnn_{false};
bool use_gpu_fp16_{false};
/// \endcond
};
......
......@@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
if (gridDim.x > 1) {
shared_max_data[tid] = T(0);
T local_max_data = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T tmp = abs(in[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
} else {
if (bid < n) {
shared_max_data[tid] = abs(in[bid]);
......@@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
extern __shared__ T shared_max_data[];
shared_max_data[tid] = T(0);
T local_max_data = T(0);
for (int i = tid; i < channel_size; i += blockDim.x) {
T tmp = fabs(in_c[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
......@@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
int tid = threadIdx.x;
int bid = blockIdx.x;
const T* in_current = in + tid * cout_wh_size + bid * wh_size;
shared_max_data[tid] = T(0);
T local_max_data = T(0);
for (int i = 0; i < wh_size; i++) {
T tmp = fabs(in_current[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
int len = blockDim.x;
......@@ -404,6 +407,19 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
}
};
template <typename T>
__global__ void FindMovingAverageAbsMaxKernel(const T* in_state,
const T* in_accum,
const T* cur_scale, const T rate,
T* out_state, T* out_accum,
T* out_scale) {
T state = rate * (*in_state) + T(1.0f);
T accum = rate * (*in_accum) + (*cur_scale);
*out_state = state;
*out_accum = accum;
*out_scale = accum / state;
}
template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
......@@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
framework::Tensor* out_accum, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T accum;
T state;
T scale;
memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
sizeof(T), ctx.stream());
memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
sizeof(T), ctx.stream());
memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
ctx.stream());
ctx.Wait();
T rate_t = static_cast<T>(rate);
state = rate_t * state + static_cast<T>(1.0);
accum = rate_t * accum + scale;
scale = accum / state;
memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
platform::CPUPlace(), &accum, sizeof(T), ctx.stream());
memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
platform::CPUPlace(), &state, sizeof(T), ctx.stream());
memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
platform::CPUPlace(), &scale, sizeof(T), ctx.stream());
ctx.Wait();
T* out_state_data = out_state->mutable_data<T>(gpu_place);
T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
out_state_data, out_accum_data, out_scale_data);
}
};
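For reference, the update that FindMovingAverageAbsMaxKernel now performs on the device, written as equivalent host-side code (a sketch; the names follow the kernel above):
// state <- rate * state + 1
// accum <- rate * accum + cur_scale
// scale <- accum / state
float MovingAverageAbsMaxUpdate(float rate, float cur_scale, float* state,
                                float* accum) {
  *state = rate * (*state) + 1.0f;
  *accum = rate * (*accum) + cur_scale;
  return *accum / *state;  // the new scale
}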
......
......@@ -15,9 +15,13 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -27,43 +31,6 @@ using Tensor = framework::Tensor;
class GridSampleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler");
OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler");
OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler");
auto x_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid");
PADDLE_ENFORCE_EQ(x_dims.size(), 4,
platform::errors::InvalidArgument(
"Input(X) of GridSampleOp should be 4-D Tensor, but "
"received X dimension size(%d)",
x_dims.size()));
PADDLE_ENFORCE_EQ(grid_dims.size(), 4,
platform::errors::InvalidArgument(
"Input(Grid) of GridSampleOp should be 4-D Tensor, "
"but received X dimension size(%d)",
grid_dims.size()));
if (ctx->IsRuntime() || grid_dims[3] > 0) {
PADDLE_ENFORCE_EQ(
grid_dims[3], 2,
platform::errors::InvalidArgument(
"Input(Grid) dimension[3] should be 2, but received %d",
grid_dims[3]));
}
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
grid_dims[0], x_dims[0],
platform::errors::InvalidArgument(
"Input(X) and Input(Grid) dimension[0] should be equal, but "
"received X dimension[0](%d) != Grid dimension[0](%d)",
x_dims[0], grid_dims[0]));
}
ctx->SetOutputDim("Output",
{x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
ctx->ShareLoD("X", "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
class GridSampleOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
framework::GradVarName("X"), "grid_sampler");
auto input_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
}
if (ctx->HasOutput(framework::GradVarName("Grid"))) {
ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -224,10 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor,
PD_INFER_META(phi::GridSampleBaseInferMeta));
REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
ops::GridSampleGradMaker<paddle::framework::OpDesc>,
ops::GridSampleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
ops::GridSampleGradMaker<paddle::imperative::OpBase>,
GridSamplerInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor,
PD_INFER_META(phi::GeneralBinaryGradInferMeta));
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad,
GridSamplerGradInferShapeFunctor);
REGISTER_OP_VERSION(grid_sampler)
.AddCheckpoint(
......
......@@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args,
EAGER_TRY
auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
egr::RunBackward(tensors, grad_tensors,
egr::Backward(tensors, grad_tensors,
CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2));
Py_INCREF(Py_None);
return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2);
auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4);
auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5);
auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6);
auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7);
std::vector<paddle::experimental::Tensor> result =
egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph,
only_inputs, allow_unused, no_grad_vars);
VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad";
return ToPyObject(result, true /* return_py_none_if_not_initialize */);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
......@@ -452,6 +472,9 @@ PyMethodDef variable_functions[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward,
METH_VARARGS | METH_KEYWORDS, NULL},
{"run_partial_grad",
(PyCFunction)(void (*)(void))eager_api_run_partial_grad,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op,
METH_VARARGS | METH_KEYWORDS, NULL},
{"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy,
......
......@@ -226,6 +226,19 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto cp_tensor =
self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true);
egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
egr::EagerUtils::autograd_meta(&cp_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(cp_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_reconstruct_from_(TensorObject* self,
PyObject* args,
PyObject* kwargs) {
......@@ -264,7 +277,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args,
egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable());
}
self->tensor.copy_(src_tensor, blocking);
self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking);
VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to "
<< self->tensor.name();
......
......@@ -96,7 +96,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value,
"Detected NULL grad"
"Please check if you have manually cleared"
"the grad inside autograd_meta"));
grad->copy_(src, true);
grad->copy_(src, self->tensor.inner_place(), true);
return 0;
EAGER_CATCH_AND_THROW_RETURN_ZERO
}
......
......@@ -492,10 +492,15 @@ PyObject* ToPyObject(const std::vector<double>& value) {
return result;
}
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value) {
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
bool return_py_none_if_not_initialize) {
PyObject* result = PyList_New((Py_ssize_t)value.size());
for (size_t i = 0; i < value.size(); i++) {
if (!value[i].initialized() && return_py_none_if_not_initialize) {
Py_INCREF(Py_None);
PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), Py_None);
} else {
PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
if (obj) {
auto v = reinterpret_cast<TensorObject*>(obj);
......@@ -507,6 +512,7 @@ PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value) {
}
PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), obj);
}
}
return result;
}
......
......@@ -68,7 +68,8 @@ PyObject* ToPyObject(const std::vector<int>& value);
PyObject* ToPyObject(const std::vector<int64_t>& value);
PyObject* ToPyObject(const std::vector<float>& value);
PyObject* ToPyObject(const std::vector<double>& value);
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value);
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
bool return_py_none_if_not_initialize = false);
PyObject* ToPyObject(const platform::Place& value);
PyObject* ToPyObject(const framework::LoDTensor* value);
PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype);
......
......@@ -551,6 +551,9 @@ void BindAnalysisConfig(py::module *m) {
.def("params_file", &AnalysisConfig::params_file)
.def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
.def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16,
py::arg("gpu_fp16_disabled_op_types") =
std::unordered_set<std::string>({}))
.def("enable_xpu", &AnalysisConfig::EnableXpu,
py::arg("l3_workspace_size") = 16 * 1024 * 1024,
py::arg("locked") = false, py::arg("autotune") = true,
......
......@@ -3,12 +3,22 @@ if (NOT WITH_INFRT)
endif()
option(INFRT_WITH_PHI "Compile INFRT with PHI" ON)
option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF)
option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF)
#TODO(xiaowei) remove fluid
include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
if (INFRT_WITH_PHI)
add_definitions("-DINFRT_WITH_PHI")
# TODO(wilber): Infrt gpu/trt currently depends on phi's components; modify the compile dependency options later.
if (INFRT_WITH_GPU)
add_definitions("-DINFRT_WITH_GPU")
if (INFRT_WITH_TRT)
add_definitions("-DINFRT_WITH_TRT")
endif()
endif()
endif()
# compile flags
......@@ -92,7 +102,6 @@ set(infrt_mlir_incs
test_kernels_inc
tensor_shape_inc
dense_tensor_inc
pd_ops_inc
pd_extra_ops_inc
trt_ops_inc
)
......@@ -106,6 +115,9 @@ if (INFRT_WITH_PHI)
endif()
cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive)
if (INFRT_WITH_TRT)
target_link_libraries(infrt infrt_trt)
endif()
cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto)
add_dependencies(infrt ${infrt_mlir_incs} mlir-headers)
......
......@@ -13,6 +13,10 @@ limitations under the License. */
#include "paddle/phi/core/allocator.h"
#ifdef INFRT_WITH_GPU
#include <cuda_runtime.h>
#endif
namespace infrt {
namespace backends {
......@@ -29,5 +33,22 @@ class CpuPhiAllocator : public phi::Allocator {
}
};
#ifdef INFRT_WITH_GPU
// TODO(wilber): Just for the demo test. We need a more efficient gpu allocator.
class GpuPhiAllocator : public phi::Allocator {
public:
static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); }
AllocationPtr Allocate(size_t bytes_size) {
void* ptr;
cudaMalloc(&ptr, bytes_size);
return AllocationPtr(
new phi::Allocation(
ptr, bytes_size, phi::Place(phi::AllocationType::GPU)),
deleter);
}
};
#endif
} // namespace backends
} // namespace infrt
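A minimal usage sketch of the new allocator (guarded the same way as the class; the byte count is arbitrary):
#ifdef INFRT_WITH_GPU
#include "paddle/infrt/backends/host/phi_allocator.h"
void DemoGpuAllocation() {
  infrt::backends::GpuPhiAllocator allocator;
  // Allocate 1 KiB on the device; the AllocationPtr's deleter (cudaFree above)
  // frees it automatically when it goes out of scope.
  auto allocation = allocator.Allocate(1024);
  void* device_ptr = allocation->ptr();
  (void)device_ptr;
}
#endif  // INFRT_WITH_GPU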
......@@ -13,6 +13,7 @@ limitations under the License. */
#include "paddle/infrt/backends/host/phi_allocator.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace infrt {
namespace backends {
......@@ -31,5 +32,16 @@ class CpuPhiContext : public phi::CPUContext {
std::unique_ptr<phi::Allocator> alloc_{std::make_unique<CpuPhiAllocator>()};
};
class GpuPhiContext : public phi::GPUContext {
public:
using Base = phi::GPUContext;
using phi::GPUContext::SetStream;
using phi::GPUContext::SetEigenDevice;
using phi::GPUContext::SetBlasHandle;
using phi::GPUContext::SetDnnHandle;
using phi::GPUContext::SetSolverHandle;
using phi::GPUContext::SetSparseHandle;
};
} // namespace backends
} // namespace infrt
......@@ -37,9 +37,9 @@ namespace infrt {
namespace backends {
namespace tensorrt {
const char* model_input = "model_input";
const char* model_output = "model_output1";
const char* model_output2 = "model_output2";
const char* model_input = "input_0";
const char* model_output = "output_0";
const char* model_output2 = "output_1";
TrtUniquePtr<nvinfer1::INetworkDefinition> ConstructNetwork(
nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) {
......@@ -122,27 +122,26 @@ TEST(trt, run_static) {
std::unordered_map<std::string, phi::DenseTensor*> inputs;
inputs.emplace(std::make_pair(model_input, &input));
phi::DenseTensor output, output2;
std::unordered_map<std::string, phi::DenseTensor*> outputs;
outputs.emplace(std::make_pair(model_output, &output));
outputs.emplace(std::make_pair(model_output2, &output2));
static_trt_engine.SetUpInference(inference_options, inputs, &outputs);
static_trt_engine.PrepareOutputHandle("output_0");
static_trt_engine.PrepareOutputHandle("output_1");
static_trt_engine.SetUpInference(inference_options, inputs);
static_trt_engine.GetEngineInfo();
static_trt_engine.Run(context);
phi::DenseTensor* output0 = static_trt_engine.GetOutput("output_0");
phi::DenseTensor* output1 = static_trt_engine.GetOutput("output_1");
std::vector<float> output_data1(inference_options.batch * 1 * 28 * 28, 0);
std::vector<float> output_data2(inference_options.batch * 2 * 28 * 28, 0);
paddle::memory::Copy(phi::CPUPlace(),
output_data1.data(),
place,
output.data<float>(),
output0->data<float>(),
sizeof(float) * output_data1.size(),
context.stream());
paddle::memory::Copy(phi::CPUPlace(),
output_data2.data(),
place,
output2.data<float>(),
output1->data<float>(),
sizeof(float) * output_data2.size(),
context.stream());
cudaStreamSynchronize(context.stream());
......@@ -208,27 +207,27 @@ TEST(trt, run_dynamic) {
context.stream());
std::unordered_map<std::string, phi::DenseTensor*> inputs;
std::unordered_map<std::string, phi::DenseTensor*> outputs;
inputs.emplace(std::make_pair(model_input, &input));
outputs.emplace(std::make_pair(model_output, &output));
outputs.emplace(std::make_pair(model_output2, &output2));
engine.SetUpInference(inference_options, inputs, &outputs);
engine.PrepareOutputHandle("output_0");
engine.PrepareOutputHandle("output_1");
engine.SetUpInference(inference_options, inputs);
engine.GetEngineInfo();
engine.Run(context);
phi::DenseTensor* output0 = engine.GetOutput("output_0");
phi::DenseTensor* output1 = engine.GetOutput("output_1");
std::vector<float> output_data1(inference_options.batch * 1 * 16 * 16, 0);
std::vector<float> output_data2(inference_options.batch * 2 * 16 * 16, 0);
paddle::memory::Copy(phi::CPUPlace(),
output_data1.data(),
place,
output.data<float>(),
output0->data<float>(),
sizeof(float) * output_data1.size(),
context.stream());
paddle::memory::Copy(phi::CPUPlace(),
output_data2.data(),
place,
output2.data<float>(),
output1->data<float>(),
sizeof(float) * output_data2.size(),
context.stream());
cudaStreamSynchronize(context.stream());
......
......@@ -21,6 +21,7 @@
#include "paddle/phi/backends/dynload/tensorrt.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
namespace infrt {
namespace backends {
......@@ -235,10 +236,20 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build,
return true;
}
void TrtEngine::PrepareOutputHandle(const std::string& out_name) {
phi::DenseTensor t;
outputs_.emplace(out_name, t);
}
phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) {
return &outputs_[name];
}
size_t TrtEngine::GetOutputNum() const { return outputs_.size(); }
bool TrtEngine::SetUpInference(
const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
std::unordered_map<std::string, phi::DenseTensor*>* outputs) {
const std::unordered_map<std::string, phi::DenseTensor*>& inputs) {
// TODO(wilber): now only create one exec_context
FreshDeviceId();
CHECK(engine_ != nullptr);
......@@ -252,10 +263,10 @@ bool TrtEngine::SetUpInference(
bindings_.front()->AddBinding(
bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT);
}
for (auto& it : *outputs) {
for (auto& it : outputs_) {
const int bind_index = engine_->getBindingIndex(it.first.c_str());
bindings_.front()->AddBinding(
bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT);
bind_index, it.first, false, &it.second, nvinfer1::DataType::kFLOAT);
}
return true;
......@@ -290,11 +301,13 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
const int bind_index = engine_->getBindingIndex(bind.name.c_str());
std::vector<int32_t> ddim;
auto dims = engine_->getBindingDimensions(bind_index);
CHECK_NE(runtime_batch, -1) << "runtime_batch should not be -1.";
ddim.push_back(runtime_batch);
for (int i = 0; i < dims.nbDims; ++i) {
ddim.push_back(dims.d[i]);
}
bind.buffer->Resize(phi::make_ddim(ddim));
// TODO(wilber): only float output is supported for now.
ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
}
......
......@@ -81,11 +81,17 @@ class TrtEngine {
// TODO(wilber): How to support multiple execution contexts?
bool SetUpInference(
const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
std::unordered_map<std::string, phi::DenseTensor*>* outputs);
const std::unordered_map<std::string, phi::DenseTensor*>& inputs);
void GetEngineInfo();
void PrepareOutputHandle(const std::string& out_name);
// TODO(wilber): The output tensor names are: output_0, output_1, ...
phi::DenseTensor* GetOutput(const std::string&);
size_t GetOutputNum() const;
private:
void FreshDeviceId();
......@@ -112,6 +118,7 @@ class TrtEngine {
std::vector<std::unique_ptr<Bindings>> bindings_;
int device_id_{0};
bool is_dynamic_shape_{false};
std::unordered_map<std::string, phi::DenseTensor> outputs_;
};
} // namespace tensorrt
......
......@@ -7,16 +7,10 @@ gather_srcs(infrt_src SRCS
dense_tensor.cc
mlir_loader.cc
diagnostic_utils.cc
pd_ops.cc
)
mlir_tablegen_on(tensor_shape DIALECT ts)
mlir_tablegen_on(dense_tensor DIALECT dt)
mlir_tablegen_on(pd_op_base DIALECT pd)
mlir_tablegen_on(pd_ops)
mlir_tablegen_on(pd_extra_ops)
mlir_add_rewriter(rewrite)
# TODO(Superjomn) add a cmake function cc_executable to encapsulate the following code
add_executable(infrtopt opt.cc)
......@@ -24,10 +18,10 @@ target_link_libraries(infrtopt infrt)
add_executable(print-ir print_ir.cc)
target_link_libraries(print-ir infrt ${mlir_libs})
add_dependencies(print-ir pd_ops_inc)
cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS})
add_subdirectory(infrt)
add_subdirectory(pd)
add_subdirectory(tensorrt)
if (INFRT_WITH_PHI)
......
......@@ -130,7 +130,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> {
}
def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> {
let summary = "ddt.tensor_map_get_size operation";
let summary = "dt.tensor_map_get_size operation";
let description = [{
An operation that gets the size of a TensorMap.
......@@ -141,6 +141,32 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> {
let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)";
}
def Infrt_TensorListGetTensorOp : DT_Op<"tensor_list_get_tensor", [NoSideEffect]> {
let summary = "dt.tensor_list_get_tensor operation";
let description = [{
An operation that can get a tensor from a TensorList.
}];
let arguments = (ins
DenseTensorList:$l,
I32Attr:$id
);
let results = (outs DenseTensor:$output);
let verifier = ?;
}
def TensorListGetSizeOp : DT_Op<"tensor_list_get_size", [NoSideEffect]> {
let summary = "dt.tensor_list_get_size operation";
let description = [{
An operation that gets the size of a TensorList.
}];
let arguments = (ins DenseTensorList:$map);
let results = (outs I32:$size);
}
def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> {
let summary = "dt.get_tensor_shape operation";
......
......@@ -89,6 +89,13 @@ def DenseTensorMap : Infrt_Type<"DenseTensorMap"> {
let parameters = (ins);
}
// TODO(wilber): Add !infrt.vec type.
def DenseTensorList : Infrt_Type<"DenseTensorList"> {
let summary = "infrt dense tensor map";
let description = [{dense_tensor map}];
let parameters = (ins);
}
// Type Constrait for concrete DenseTensor type.
class DenseTensor<string target, string precision, string layout> :
Type<CPred<"$_self == ::infrt::DenseTensorType::get($_self.getContext(), ::infrt::TargetType::"#target#",::infrt::PrecisionType::"#precision#",::infrt::LayoutType::"#layout#")">,
......
......@@ -138,6 +138,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
parser.getContext(), *targetType, *precisionType, *layoutType);
}
if (keyword == "tensor_list") {
return infrt::DenseTensorListType::get(parser.getContext());
}
if (keyword == "dense_tensor_map") {
return DenseTensorMapType::get(parser.getContext());
}
......@@ -175,6 +179,9 @@ void InfrtDialect::printType(::mlir::Type type,
return;
}
if (type.isa<infrt::DenseTensorListType>()) {
os << "tensor_list";
}
// print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
if (type.isa<DenseTensorMapType>()) {
os << "dense_tensor_map";
......
......@@ -3,7 +3,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "paddle/infrt/dialect/infrt/ir/infrt_ops.td"
include "paddle/infrt/dialect/pd_ops.td"
include "paddle/infrt/dialect/pd/ir/pd_ops.td"
def FuseTensorCastPattern : Pat<
(Infrt_TensorCastOp (Infrt_TensorCastOp $arg)),
......
......@@ -16,7 +16,7 @@
#include <mlir/Transforms/GreedyPatternRewriteDriver.h>
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace {
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT
......
......@@ -20,12 +20,13 @@
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
#include "paddle/infrt/dialect/tensor_shape.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
namespace infrt {
void registerCinnDialects(mlir::DialectRegistry &registry) { // NOLINT
......@@ -37,7 +38,8 @@ void registerCinnDialects(mlir::DialectRegistry &registry) { // NOLINT
phi::PHIDenseTensorDialect,
phi::PHICPUKernelDialect,
phi::PHIGPUKernelDialect,
phi::PHIDialect
phi::PHIDialect,
infrt::trt::TensorRTDialect
#endif
>();
}
......
add_subdirectory(common)
add_subdirectory(ir)
add_subdirectory(pass)
core_gather_headers()
gather_srcs(infrt_src SRCS
)
core_gather_headers()
gather_srcs(infrt_src SRCS
pd_ops.cc
)
add_mlir_dialect(pd_ops pd)
mlir_tablegen_on(pd_extra_ops)
......@@ -4,7 +4,7 @@
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/LoopLikeInterface.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/pd_op_base.td"
include "paddle/infrt/dialect/pd/ir/pd_op_base.td"
def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> {
let summary = "Computes the Fully Connected result of two tensors";
......
......@@ -8,7 +8,7 @@ include "mlir/IR/OpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
def PD_Dialect : Dialect {
def Paddle_Dialect : Dialect {
let name = "pd";
let description = [{
......@@ -16,12 +16,12 @@ def PD_Dialect : Dialect {
This dialect contains the PaddlePaddle operators.
}];
let hasConstantMaterializer = 1;
let cppNamespace = "mlir::pd";
}
class PD_Op<string mnemonic, list<OpTrait> traits = []> :
Op<PD_Dialect, mnemonic, traits>;
Op<Paddle_Dialect, mnemonic, traits>;
class PD_PaddleAttr <string name, string description> :
......
......@@ -12,29 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include <mlir/IR/Matchers.h>
#include <mlir/IR/PatternMatch.h>
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.cpp.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT
namespace mlir {
namespace pd {
#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT
PaddleDialect::PaddleDialect(MLIRContext *context)
: Dialect("pd", context, TypeID::get<PaddleDialect>()) {
void PaddleDialect::initialize() {
addOperations<
#define GET_OP_LIST
#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT
,
#define GET_OP_LIST
#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT
>();
}
......@@ -73,106 +71,5 @@ mlir::OpFoldResult ConstantOp::fold(
::llvm::ArrayRef<mlir::Attribute> operands) {
return value();
}
/*
LogicalResult ElementwiseAdd::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
*/
void Elementwise_addOp::getCanonicalizationPatterns(
mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
results.insert<FuseMulAdd>(context);
}
/*
mlir::OpFoldResult ElementwiseAdd::fold(
llvm::ArrayRef<mlir::Attribute> operands) {
if (getElementTypeOrSelf(getType()).isa<FloatType>()) {
if (!operands[0] || !operands[1]) return {};
DenseElementsAttr lhs = operands[0].dyn_cast<DenseElementsAttr>();
DenseElementsAttr rhs = operands[1].dyn_cast<DenseElementsAttr>();
if (!lhs || !rhs) return {};
ShapedType type = getType().template cast<ShapedType>();
if (!type.hasStaticShape()) return {};
Type etype = type.getElementType();
if (!etype.isa<FloatType>()) return {};
SmallVector<APFloat, 6> values;
values.reserve(lhs.getNumElements());
for (const auto zip :
llvm::zip(lhs.getValues<APFloat>(), rhs.getValues<APFloat>())) {
values.push_back(
std::plus<APFloat>()(std::get<0>(zip), std::get<1>(zip)));
}
return DenseElementsAttr::get(type, values);
}
return {};
}
LogicalResult ElementwiseDiv::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
LogicalResult ElementwiseMul::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
LogicalResult ElementwiseSub::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
LogicalResult MulOp::inferReturnTypes(
MLIRContext *context,
Optional<Location> location,
ValueRange operands,
DictionaryAttr attributes,
RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
inferredReturnTypes.push_back(operands[0].getType());
return success();
}
void ReluOp::getCanonicalizationPatterns(
mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
results.insert<FuseFCRelu>(context);
}
void FusedRepeatedFCRelu::getCanonicalizationPatterns(
mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
results.insert<FuseRepeatedFCRelu2>(context);
}
void BatchNormOp::getCanonicalizationPatterns(
mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
results.insert<FuseBatchNormWithConvPattern>(context);
}*/
} // namespace pd
} // namespace mlir
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,49 +14,20 @@
#pragma once
#include <mlir/Dialect/Traits.h>
#include <mlir/IR/Attributes.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/BuiltinOps.h>
//===----------------------------------------------------------------------===//
// Dialect
//===----------------------------------------------------------------------===//
#include <llvm/ADT/StringMap.h>
#include <mlir/IR/BuiltinTypes.h>
#include <mlir/IR/Dialect.h>
#include <mlir/IR/Matchers.h>
#include <mlir/IR/OpDefinition.h>
#include <mlir/IR/OpImplementation.h>
#include <mlir/IR/TypeUtilities.h>
#include <mlir/Interfaces/CallInterfaces.h>
#include <mlir/Interfaces/DerivedAttributeOpInterface.h>
#include <mlir/Interfaces/InferTypeOpInterface.h>
#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
namespace mlir {
namespace pd {
class PaddleDialect : public Dialect {
public:
explicit PaddleDialect(MLIRContext* context);
static StringRef getDialectNamespace() { return "pd"; }
/// A hook used to materialize constant values with the given type.
Operation* materializeConstant(OpBuilder& builder,
Attribute value,
Type type,
Location loc) override;
Type parseType(DialectAsmParser& parser) const override {
return Dialect::parseType(parser);
}
void printType(Type type, DialectAsmPrinter& printer) const override {
Dialect::printType(type, printer);
}
};
} // namespace pd
} // namespace mlir
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.h.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/pd_ops.hpp.inc"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/pd_extra_ops.hpp.inc"
#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.hpp.inc"
core_gather_headers()
gather_srcs(infrt_src SRCS
pd_op_fuse_pass.cc
)
mlir_add_rewriter(pd_op_fuse)
......@@ -3,8 +3,8 @@
include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "paddle/infrt/dialect/pd_ops.td"
include "paddle/infrt/dialect/pd_extra_ops.td"
include "paddle/infrt/dialect/pd/ir/pd_ops.td"
include "paddle/infrt/dialect/pd/ir/pd_extra_ops.td"
//===----------------------------------------------------------------------===//
// This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'.
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT
#include <mlir/Transforms/GreedyPatternRewriteDriver.h>
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace {
#include "paddle/infrt/dialect/pd/pass/pd_op_fuse.cpp.inc" // NOLINT
/*
* PdOpFusePass.
*/
struct PdOpFusePass
: public mlir::PassWrapper<PdOpFusePass, mlir::FunctionPass> {
public:
::llvm::StringRef getName() const override { return "PdOpFusePass"; }
llvm::StringRef getArgument() const override { return "pd-op-fuse"; }
void runOnFunction() override;
};
// Implementation of the PdOpFusePass.
void PdOpFusePass::runOnFunction() {
::mlir::RewritePatternSet patterns(&getContext());
populateWithGenerated(patterns);
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
} // namespace
mlir::PassRegistration<PdOpFusePass> infrt_op_fuse_pass;
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mlir/Pass/Pass.h>
namespace infrt {
/*
* PdOpFusePass.
*/
std::unique_ptr<mlir::Pass> CreatePdOpFusePass();
} // namespace infrt
......@@ -21,8 +21,8 @@ def PHI_DenseTensorDialect : Dialect {
class PDT_Op<string mnemonic, list<OpTrait> traits = []> : Op<PHI_DenseTensorDialect,
mnemonic, !listconcat(traits, [PhiOpTrait, IsolatedFromAbove])> {}
class CreateDenseTensorOp
: PDT_Op<"create_dense_tensor", [NoSideEffect]> {
class CreateDenseTensorOp<string target>
: PDT_Op<"create_dense_tensor." # target, [NoSideEffect]> {
let arguments = (ins Context:$context, I64ArrayAttr:$dims,
LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision);
let results = (outs DenseTensor:$output);
......@@ -51,9 +51,11 @@ class CreateContextOp<string target>
let results = (outs Context:$output);
}
def PDT_CreateDenseTensorOp : CreateDenseTensorOp;
def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">;
def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">;
def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp<F32ArrayAttr, "f32">;
def PDT_CreateCPUContextOp : CreateContextOp<"cpu">;
def PDT_CreateGPUContextOp : CreateContextOp<"gpu">;
def PDT_PrintDenseTensor : PrintDenseTensorOp;
def FakeKernelOp : PDT_Op<"fake_phi_kernel"> {
......
......@@ -32,6 +32,7 @@
#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h"
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/phi/core/kernel_factory.h"
#include "paddle/phi/ops/compat/signatures.h"
namespace {
......@@ -94,7 +95,17 @@ void PhiOpConvertPass::convertStage() {
// Todo: print log
continue;
}
auto loc = getFunction().getLoc();
builder.setInsertionPoint(op);
if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) {
std::string kernel_name = phi::TransToPhiKernelName(op_name);
auto kernel_op = builder.create<infrt::KernelOp>(loc,
op->getResultTypes(),
op->getOperands(),
kernel_name,
op->getAttrDictionary());
op->replaceAllUsesWith(kernel_op.getResults());
} else {
::phi::KernelSignature kernel_sign =
::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)(
infrt::ProtoArgumentMappingContext(op));
......@@ -121,15 +132,12 @@ void PhiOpConvertPass::convertStage() {
output_types.push_back(op->getResultTypes()[index]);
ori_output.push_back(op->getResult(index));
}
auto loc = getFunction().getLoc();
builder.setInsertionPoint(op);
auto kernel_op = builder.create<infrt::KernelOp>(
loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary());
for (size_t index = 0; index < ori_output.size(); ++index) {
ori_output[index].replaceAllUsesWith(kernel_op.getResult(index));
}
}
CHECK(op->use_empty());
op->erase();
}
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <mlir/IR/Operation.h>
#include <unordered_map>
#include "paddle/infrt/dialect/pd_ops_info.h"
#include "paddle/infrt/dialect/pd/common/pd_ops_info.h"
#include "paddle/phi/core/compat/arg_map_context.h"
namespace infrt {
......
......@@ -3,7 +3,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
include "paddle/infrt/dialect/pd_ops.td"
include "paddle/infrt/dialect/pd/ir/pd_ops.td"
include "paddle/infrt/dialect/tensorrt/trt_ops.td"
def PD2TRT_Matmul_Lower : Pat<
......
......@@ -17,11 +17,12 @@
#include <llvm/ADT/SetVector.h>
#include <mlir/Analysis/SliceAnalysis.h>
#include <mlir/IR/Builders.h>
#include <paddle/infrt/dialect/pd_ops.h>
#include <list>
#include <unordered_set>
#include <vector>
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace infrt {
namespace trt {
namespace {
......
......@@ -15,7 +15,7 @@
#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
#include <mlir/IR/Builders.h>
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace infrt {
namespace trt {
......
......@@ -14,7 +14,7 @@
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
#include <mlir/IR/Builders.h>
#include <mlir/Transforms/DialectConversion.h>
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
namespace infrt {
......
......@@ -17,7 +17,7 @@
#include <mlir/IR/Builders.h>
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace infrt {
namespace trt {
......
......@@ -21,6 +21,10 @@
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
namespace infrt {
namespace trt {
......
......@@ -30,7 +30,7 @@
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
namespace infrt {
namespace trt {
......
......@@ -7,6 +7,8 @@ include "mlir/Interfaces/CallInterfaces.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/tensorrt/trt_op_base.td"
include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> {
let summary = "trt CreateEngine Op";
......@@ -14,8 +16,8 @@ def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<
Describe a tensorrt subgraph.
}];
let regions = (region SizedRegion<1>:$body);
let arguments = (ins Variadic<TRT_Tensor>:$inputs, DefaultValuedAttr<BoolAttr, "true">:$run_once);
let results = (outs TRT_EngineType:$output);
let arguments = (ins Variadic<DenseTensor>:$inputs, DefaultValuedAttr<BoolAttr, "true">:$run_once);
let results = (outs TRT_EngineType:$engine);
}
def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> {
......@@ -23,8 +25,25 @@ def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> {
let description = [{
Describe a tensorrt runtime.
}];
let arguments = (ins TRT_EngineType:$engine, Variadic<TRT_Tensor>:$inputs);
let results = (outs Variadic<TRT_Tensor>:$output);
let arguments = (ins TRT_EngineType:$engine, Variadic<DenseTensor>:$inputs);
let results = (outs Variadic<DenseTensor>:$output);
}
def TRT_EngineComputeOp : TRT_Op<"compute", [NoSideEffect]> {
let summary = "trt compute engine";
let description = [{
    Execute the TensorRT engine with the given context and return its output tensors.
}];
let arguments = (ins TRT_EngineType:$engine, Context:$context);
let results = (outs DenseTensorList:$outputs);
}
def TRT_InspectEngineOp : TRT_Op<"inspect_engine", [NoSideEffect]> {
let summary = "trt inspect engine";
let description = [{
    Print the TensorRT engine information.
}];
let arguments = (ins TRT_EngineType:$engine);
}
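// Illustrative composition of the ops above (it mirrors the trt lit test added
// in this change): build an engine from a single-block region, inspect it, then
// run it to obtain a tensor list.
//
//   %engine = "trt.create_engine"(%input) ({ ... "infrt.return"(%out) ... })
//       : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !trt.engine
//   "trt.inspect_engine"(%engine) {} : (!trt.engine) -> ()
//   %outs = "trt.compute"(%engine, %ctx) {} : (!trt.engine, !phi.context<GPU>) -> (!infrt.tensor_list)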
def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> {
......@@ -34,11 +53,11 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> {
TensorRT IActivationLayer.
}];
let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type,
let arguments = (ins DenseTensor:$input, SI32Attr:$activation_type,
DefaultValuedAttr<F32Attr, "0.0">:$alpha,
DefaultValuedAttr<F32Attr, "0.0">:$beta);
let results = (outs TRT_Tensor:$output);
let results = (outs DenseTensor:$output);
}
def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> {
......@@ -48,9 +67,9 @@ def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> {
TensorRT IElementWiseLayer.
}];
let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation);
let arguments = (ins DenseTensor:$input1, DenseTensor:$input2, SI32Attr:$elementwise_operation);
let results = (outs TRT_Tensor:$output);
let results = (outs DenseTensor:$output);
}
def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> {
......@@ -60,10 +79,10 @@ def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> {
TensorRT IMatrixMultiplyLayer.
}];
let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1,
TRT_Tensor:$input2, BoolAttr:$transpose2);
let arguments = (ins DenseTensor:$input1, BoolAttr:$transpose1,
DenseTensor:$input2, BoolAttr:$transpose2);
let results = (outs TRT_Tensor:$output);
let results = (outs DenseTensor:$output);
}
#endif // TRT_OPS
......@@ -33,7 +33,10 @@
#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
#include "paddle/infrt/kernel/phi/registry.h"
#endif
#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
#include "paddle/infrt/kernel/tensorrt/registry.h"
#endif // INFRT_WITH_GPU && INFRT_WITH_TRT
#endif // INFRT_WITH_PHI
static llvm::cl::list<std::string> cl_shared_libs( // NOLINT
"shared_libs",
......@@ -62,6 +65,9 @@ int main(int argc, char** argv) {
#ifdef INFRT_WITH_PHI
kernel::RegisterPhiKernels(&registry);
kernel::RegisterInferShapeLaunchers(&registry);
#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
kernel::RegisterTrtKernels(&registry);
#endif // INFRT_WITH_GPU && INFRT_WITH_TRT
#endif
// load extra shared library
......
......@@ -16,12 +16,14 @@
#include <llvm/Support/SourceMgr.h>
#include <mlir/Dialect/StandardOps/IR/Ops.h>
#include <mlir/IR/BuiltinAttributes.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/BuiltinTypes.h>
#include <mlir/IR/Diagnostics.h>
#include <mlir/IR/OperationSupport.h>
#include <mlir/Parser.h>
#include <glog/logging.h>
#include <iostream>
#include <memory>
#include <string>
......@@ -42,6 +44,13 @@
#include "paddle/infrt/host_context/value.h"
#include "paddle/infrt/tensor/tensor_shape.h"
#ifdef INFRT_WITH_PHI
#ifdef INFRT_WITH_TRT
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
#endif
#include "paddle/phi/core/dense_tensor.h"
#endif
namespace infrt {
namespace host_context {
......@@ -277,7 +286,31 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
impl_->runtime->NewOpExecutable(op->getName().getStringRef().str());
VLOG(3) << "processing general op : " << op->getName().getStringRef().str();
// TODO(wilber): Find a more appropriate way to handle special cases.
if (op->getName().getStringRef() == "trt.create_engine") {
#ifdef INFRT_WITH_TRT
auto* symbols = impl_->runtime->symbol_table();
::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol mlir_operation;
mlir_operation.operation = op;
mlir_operation.symbol_table = symbols;
impl_->cur_op->AppendArgument(new Value(mlir_operation));
    // TODO(wilber): how to pass DenseTensor to create_engine op? Temporarily
    // add a naive implementation.
for (int i = 0, e = op->getNumOperands(); i < e; ++i) {
auto operand = op->getOperand(i);
if (operand.isa<mlir::BlockArgument>()) {
mlir::BlockArgument arg = operand.dyn_cast<mlir::BlockArgument>();
Value* arg_value = GetValue(arg);
if (arg_value->is_type<phi::DenseTensor>()) {
impl_->runtime->FeedInArgs(
std::make_pair(std::to_string(i), ValueRef(arg_value)));
}
}
}
#else
CHECK(false) << "should not reach here";
#endif
} else {
// process operands
for (int i = 0, e = op->getNumOperands(); i < e; i++) {
// function argument as value
......@@ -305,6 +338,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " "
<< GetValue(operand) << " vs " << arg_value;
}
}
// process attributes
auto attrs = op->getAttrs();
......@@ -383,33 +417,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
impl_->cur_op->AppendAttribute(tmp[i]);
}
// process results
llvm::SmallVector<Value*, 4> res_values;
for (int i = 0, e = op->getNumResults(); i < e; i++) {
auto res = op->getResult(i);
if (res.getType().isa<::infrt::DenseTensorType>()) {
auto r = impl_->value_map.try_emplace(
res, ValueRef(new Value{::phi::DenseTensor()}));
CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res)
<< "]";
res_values.push_back(r.first->second.get());
} else {
res_values.push_back(AddValue(res));
}
VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res);
}
impl_->cur_op->SetResults(res_values);
#ifdef INFRT_DEBUG
{
VLOG(3) << "check result";
for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) {
VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i];
}
}
#endif
// process regions, we treat regions as attribute.
auto num_regions = op->getNumRegions();
if (num_regions > 0) {
......@@ -438,6 +445,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
impl_->cur_op->AppendAttribute(new Value(function));
}
// process results
llvm::SmallVector<Value*, 4> res_values;
for (int i = 0, e = op->getNumResults(); i < e; i++) {
auto res = op->getResult(i);
if (res.getType().isa<::infrt::DenseTensorType>()) {
auto r = impl_->value_map.try_emplace(
res, ValueRef(new Value{::phi::DenseTensor()}));
CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res)
<< "]";
res_values.push_back(r.first->second.get());
} else {
res_values.push_back(AddValue(res));
}
VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res);
}
impl_->cur_op->SetResults(res_values);
#ifdef INFRT_DEBUG
{
VLOG(3) << "check result";
for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) {
VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i];
}
}
#endif
return true;
}
......
......@@ -15,7 +15,7 @@
#include "paddle/infrt/host_context/paddle_mlir.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops_info.h"
#include "paddle/infrt/dialect/pd/common/pd_ops_info.h"
MLIRModelGenImpl::MLIRModelGenImpl()
: context_(infrt::Global::getMLIRContext()), builder_(context_) {
......
......@@ -14,22 +14,22 @@
#ifndef PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_
#define PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_
#include <llvm/Support/CommandLine.h>
#include <mlir/Dialect/StandardOps/IR/Ops.h>
#include <mlir/IR/AsmState.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/MLIRContext.h>
#include <fstream>
#include <iostream>
#include <string>
#include "llvm/Support/CommandLine.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/common/string.h"
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/init_dialects.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/tensor_shape.h"
#include "paddle/infrt/paddle/model_parser.h"
......
......@@ -24,6 +24,7 @@
#include "paddle/infrt/common/shared.h"
#include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/host_context/function.h"
#include "paddle/infrt/host_context/symbol_table.h"
#include "paddle/infrt/support/variant.h"
#include "paddle/infrt/tensor/dense_host_tensor.h"
#include "paddle/infrt/tensor/dense_tensor_view.h"
......@@ -41,7 +42,15 @@
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#endif
#ifdef INFRT_WITH_GPU
#include "paddle/phi/backends/gpu/gpu_context.h"
#endif // INFRT_WITH_GPU
#ifdef INFRT_WITH_TRT
#include "paddle/infrt/backends/tensorrt/trt_engine.h"
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
#endif // INFRT_WITH_TRT
#endif // INFRT_WITH_PHI
namespace infrt {
namespace host_context {
......@@ -72,8 +81,13 @@ using ValueVariantType =
::phi::MetaTensor,
::phi::DenseTensor,
backends::CpuPhiContext,
#ifdef INFRT_WITH_GPU
backends::GpuPhiContext,
::phi::GPUContext,
#endif
::phi::CPUContext,
std::vector<const phi::DenseTensor*>,
std::vector<phi::DenseTensor*>,
paddle::experimental::ScalarBase<phi::DenseTensor>,
paddle::experimental::ScalarArrayBase<phi::DenseTensor>,
std::vector<phi::MetaTensor*>,
......@@ -81,6 +95,10 @@ using ValueVariantType =
paddle::experimental::Backend,
paddle::experimental::DataLayout,
paddle::experimental::DataType,
#ifdef INFRT_WITH_TRT
::infrt::backends::tensorrt::TrtEngine,
::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol,
#endif // INFRT_WITH_TRT
#endif
std::vector<int16_t>,
std::vector<int32_t>,
......@@ -120,8 +138,18 @@ class Value : public common::Object {
#ifdef INFRT_WITH_PHI
explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {}
explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {}
#ifdef INFRT_WITH_GPU
explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {}
explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {}
#endif
explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {}
explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {}
#ifdef INFRT_WITH_TRT
explicit Value(::infrt::backends::tensorrt::TrtEngine&& x)
: data(std::move(x)) {}
explicit Value(::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol x)
: data(x) {}
#endif // INFRT_WITH_TRT
#endif
template <typename T>
......
add_subdirectory(phi)
add_subdirectory(tensorrt)
core_gather_headers()
......
......@@ -25,6 +25,16 @@ namespace phi {
return ctx;
}
#ifdef INFRT_WITH_GPU
::phi::GPUContext CreateGPUContext() {
::phi::GPUContext context;
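  // Two-phase initialization: set up the allocator-independent state first,
  // plug in the infrt GPU allocator, then finish the allocator-dependent part.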
context.PartialInitWithoutAllocator();
context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{});
context.PartialInitWithAllocator();
return context;
}
#endif
} // namespace phi
} // namespace kernel
} // namespace infrt
......@@ -25,6 +25,10 @@ namespace phi {
::phi::CPUContext CreateCPUContext();
#ifdef INFRT_WITH_GPU
::phi::GPUContext CreateGPUContext();
#endif
} // namespace phi
} // namespace kernel
} // namespace infrt
......@@ -15,6 +15,12 @@
#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
#include "paddle/infrt/dialect/phi/data_type.h"
#include "paddle/infrt/kernel/phi/context_kernels.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#ifdef INFRT_WITH_GPU
#include <cuda_runtime.h>
#endif
namespace infrt {
namespace kernel {
......@@ -34,26 +40,83 @@ namespace phi {
{}));
}
::phi::DenseTensor CreateGPUDenseTensor(
const ::phi::GPUContext& context,
host_context::Attribute<std::vector<int64_t>> dims,
host_context::Attribute<std::vector<int64_t>> lod,
host_context::Attribute<::infrt::LayoutType> layout,
host_context::Attribute<::infrt::PrecisionType> precision) {
return ::phi::DenseTensor(
const_cast<::phi::Allocator*>(&context.GetAllocator()),
::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()),
::phi::make_ddim(dims.get()),
ConvertLayoutToPhi(layout.get()),
{}));
}
void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
host_context::Attribute<std::vector<float>> value) {
auto place = ::phi::CPUPlace();
auto place = dense_tensor->place();
float* a_data = dense_tensor->mutable_data<float>(place);
if (place.GetType() == ::phi::AllocationType::CPU) {
for (int64_t i = 0; i < dense_tensor->numel(); ++i) {
a_data[i] = (value.get())[i];
}
} else if (place.GetType() == ::phi::AllocationType::GPU) {
#ifdef INFRT_WITH_GPU
    // TODO(wilber): how to pass a stream parameter so the copy can run on a stream.
cudaMemcpy(a_data,
value.get().data(),
sizeof(float) * value.get().size(),
cudaMemcpyHostToDevice);
#endif
} else {
llvm_unreachable("temporarily not support other target.");
}
}
void PrintDenseTensor(::phi::DenseTensor* dense_tensor) {
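  // Prints the tensor's shape followed by its elements as a comma-separated
  // list; in the GPU build the macro below first copies device data back to
  // the host with cudaMemcpy before printing.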
#ifndef INFRT_WITH_GPU
#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \
case ::phi::DataType::PHI_DATATYPE: { \
auto place = dense_tensor->place(); \
if (place.GetType() == ::phi::AllocationType::CPU) { \
DTYPE* data = dense_tensor->data<DTYPE>(); \
if (dense_tensor->numel() == 0) break; \
std::cout << data[0]; \
for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
std::cout << "," << data[i]; \
} \
} \
break; \
}
#else
#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \
case ::phi::DataType::PHI_DATATYPE: { \
auto place = dense_tensor->place(); \
DTYPE* data = dense_tensor->data<DTYPE>(); \
if (dense_tensor->numel() == 0) break; \
if (place.GetType() == ::phi::AllocationType::CPU) { \
std::cout << data[0]; \
for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
std::cout << "," << data[i]; \
} \
} else if (place.GetType() == ::phi::AllocationType::GPU) { \
std::vector<DTYPE> host_data(dense_tensor->numel(), 0); \
cudaMemcpy(host_data.data(), \
data, \
sizeof(DTYPE) * dense_tensor->numel(), \
cudaMemcpyDeviceToHost); \
std::cout << host_data[0]; \
for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
std::cout << "," << host_data[i]; \
} \
} else { \
llvm_unreachable("temporarily not support other target."); \
} \
break; \
}
#endif
::phi::DDim dims = dense_tensor->dims();
std::cout << "dense_tensor: shape=shape" << dims.to_str() << ","
......
......@@ -30,6 +30,13 @@ namespace phi {
host_context::Attribute<::infrt::LayoutType> layout,
host_context::Attribute<::infrt::PrecisionType> precision);
::phi::DenseTensor CreateGPUDenseTensor(
const ::phi::GPUContext& context,
host_context::Attribute<std::vector<int64_t>> dims,
host_context::Attribute<std::vector<int64_t>> lod,
host_context::Attribute<::infrt::LayoutType> layout,
host_context::Attribute<::infrt::PrecisionType> precision);
void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
host_context::Attribute<std::vector<float>> values);
void PrintDenseTensor(::phi::DenseTensor* dense_tensor);
......
......@@ -35,7 +35,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
registry->AddKernel("phi_dt.create_context.cpu",
INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext));
registry->AddKernelWithAttrs(
"phi_dt.create_dense_tensor",
"phi_dt.create_dense_tensor.cpu",
INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor),
{"dims", "lod", "layout", "precision"});
registry->AddKernelWithAttrs(
......@@ -44,6 +44,15 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
{"value"});
registry->AddKernel("phi_dt.print_tensor",
INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor));
#ifdef INFRT_WITH_GPU
registry->AddKernel("phi_dt.create_context.gpu",
INFRT_KERNEL(infrt::kernel::phi::CreateGPUContext));
registry->AddKernelWithAttrs(
"phi_dt.create_dense_tensor.gpu",
INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor),
{"dims", "lod", "layout", "precision"});
#endif
}
} // namespace kernel
......
......@@ -25,6 +25,10 @@
#include "paddle/infrt/tensor/tensor_map.h"
#include "paddle/infrt/tensor/tensor_shape.h"
#ifdef INFRT_WITH_PHI
#include "paddle/phi/core/dense_tensor.h"
#endif
namespace infrt {
namespace kernel {
using namespace host_context; // NOLINT
......@@ -62,6 +66,20 @@ DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute<std::string> name) {
int32_t TensorMapGetSize(TensorMap map) { return map.size(); }
// TODO(wilber): Maybe we should place TensorList type in dt dialect.
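// These kernels operate on std::vector<phi::DenseTensor*> values (exposed in
// MLIR as !infrt.tensor_list), e.g. the outputs produced by the trt.compute op.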
#ifdef INFRT_WITH_PHI
phi::DenseTensor TensorListGetTensor(std::vector<phi::DenseTensor *> list,
Attribute<int32_t> idx) {
CHECK_LT(idx.get(), static_cast<int>(list.size()))
<< "idx should less than list size";
return *list[idx.get()];
}
int32_t TensorListGetSize(const std::vector<phi::DenseTensor *> &list) {
return list.size();
}
#endif
DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; }
template <typename T>
......@@ -126,6 +144,14 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) {
INFRT_KERNEL(TensorMapGetTensor));
registry->AddKernel("dt.tensor_map_get_size", INFRT_KERNEL(TensorMapGetSize));
// TensorList related methods.
#ifdef INFRT_WITH_PHI
registry->AddKernel("dt.tensor_list_get_tensor",
INFRT_KERNEL(TensorListGetTensor));
registry->AddKernel("dt.tensor_list_get_size",
INFRT_KERNEL(TensorListGetSize));
#endif
registry->AddKernel("dt.shallow_copy_tensor",
INFRT_KERNEL(ShallowCopyTensor));
......
if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT))
return()
endif()
core_gather_headers()
gather_srcs(infrt_src SRCS
registry.cc
trt_kernels.cc
)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/kernel/tensorrt/registry.h"
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
namespace infrt {
namespace kernel {
void RegisterTrtKernels(host_context::KernelRegistry* registry) {
registry->AddKernel("trt.create_engine",
INFRT_KERNEL(tensorrt::CreateTrtEngine));
registry->AddKernel("trt.inspect_engine",
INFRT_KERNEL(tensorrt::PrintTrtLayer));
registry->AddKernel("trt.compute", INFRT_KERNEL(tensorrt::TrtEngineCompute));
}
} // namespace kernel
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
namespace infrt {
namespace host_context {
struct KernelRegistry;
} // namespace host_context
} // namespace infrt
namespace infrt {
namespace kernel {
/**
 * Register all the TensorRT kernels to the registry.
*/
void RegisterTrtKernels(host_context::KernelRegistry* registry);
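// Typical usage (as wired up for infrtexec in this change): create a
// host_context::KernelRegistry and call RegisterTrtKernels(&registry)
// alongside the phi kernel registration.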
} // namespace kernel
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
#include <string>
#include "NvInfer.h"
#include "NvInferRuntime.h"
#include "NvInferRuntimeCommon.h"
#include "glog/logging.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
#include "paddle/infrt/backends/tensorrt/trt_engine.h"
#include "paddle/infrt/backends/tensorrt/trt_options.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
#include "paddle/infrt/host_context/symbol_table.h"
#include "paddle/phi/core/dense_tensor.h"
namespace infrt {
namespace kernel {
namespace tensorrt {
::infrt::backends::tensorrt::TrtEngine CreateTrtEngine(
MlirOperationWithInfrtSymbol
create_engine_op /*, input_tensors, output_tensors, weights*/) {
// TODO(wilber): The device_id needs to get from mlir.
int device_id = 0;
backends::tensorrt::TrtEngine engine(device_id);
auto* builder = engine.GetTrtBuilder();
// TODO(wilber): How to process weights?
backends::tensorrt::TrtUniquePtr<nvinfer1::INetworkDefinition> network;
  // TODO(wilber): static_shape or dynamic_shape network? The code below only
  // covers the static_shape case.
network.reset(builder->createNetworkV2(0));
  // TODO(wilber): The build options should be filled in from mlir info.
backends::tensorrt::BuildOptions options;
options.max_batch = 4;
// Parse mlir Region which only has one block.
mlir::Operation& operation = *create_engine_op.operation;
auto* symbol_table = create_engine_op.symbol_table;
CHECK_NOTNULL(symbol_table);
unsigned int num_regions = operation.getNumRegions();
  CHECK_EQ(num_regions, 1U) << "only the single-region case is supported.";
auto& region = operation.getRegion(0);
auto& block = region.getBlocks().front();
llvm::DenseMap<mlir::Value, nvinfer1::ITensor*> map_info;
std::unordered_map<std::string, phi::DenseTensor*> trt_bind_inputs;
for (auto index_operand : llvm::enumerate(operation.getOperands())) {
mlir::Value operand = index_operand.value();
size_t idx = index_operand.index();
const std::string input_name = "input_" + std::to_string(idx);
auto* v = symbol_table->GetValue(std::to_string(idx));
CHECK_NOTNULL(v);
auto* t = &v->get<phi::DenseTensor>();
trt_bind_inputs[input_name] = t;
// TODO(wilber): get input info from mlir.
    // TODO(wilber): input dims; only static_shape is supported for now, so
    // just drop the first (batch) dimension.
    // TODO(wilber): only float input is supported for now.
nvinfer1::Dims dims;
dims.nbDims = t->dims().size() - 1;
for (int i = 0; i < dims.nbDims; ++i) {
dims.d[i] = t->dims()[i + 1];
}
auto* in =
network->addInput(input_name.c_str(), nvinfer1::DataType::kFLOAT, dims);
map_info[operand] = in;
}
  // TODO(wilber): Find a general way to add layers.
for (auto& inner_op : block.without_terminator()) {
if (inner_op.getName().getStringRef() == "trt.Activation") {
trt::ActivationOp act_op = llvm::dyn_cast<trt::ActivationOp>(inner_op);
auto in_arg = act_op.getOperand();
if (!map_info.count(in_arg)) {
CHECK(false) << "map_info not has in_arg.";
}
nvinfer1::ActivationType act_type =
static_cast<nvinfer1::ActivationType>(act_op.activation_type());
auto* act_layer = network->addActivation(*map_info[in_arg], act_type);
act_layer->setAlpha(act_op.alpha().convertToFloat());
act_layer->setBeta(act_op.beta().convertToFloat());
for (size_t i = 0; i < act_op->getNumResults(); ++i) {
nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i);
mlir::Value act_out = act_op->getResult(i);
map_info[act_out] = act_out_tensor;
}
}
// if (inner_op.getName().getStringRef() == "trt.Constant") {
// trt::ConstantOp op = llvm::dyn_cast<trt::ConstantOp>(inner_op);
// mlir::Value op_out = op.getResult();
// std::vector<float> weight_data{1};
// auto* layer = network->addConstant(nvinfer1::Dims2(1, 1),
// nvinfer1::Weights{nvinfer1::DataType::kFLOAT, weight_data.data(), 1});
    //   auto* op_out_tensor = layer->getOutput(0);
    //   map_info[op_out] = op_out_tensor;
// }
}
for (auto& inner_op : block.without_terminator()) {
for (mlir::Value v : inner_op.getResults()) {
for (mlir::Operation* user : v.getUsers()) {
if (user->getName().getStringRef() == "infrt.return") {
if (!map_info.count(v)) {
CHECK(false) << "map_info not has value";
}
network->markOutput(*map_info[v]);
}
}
}
}
// std::unordered_map<std::string, phi::DenseTensor*> trt_bind_outputs;
mlir::Operation* ret = block.getTerminator();
for (unsigned int i = 0; i < ret->getNumOperands(); ++i) {
mlir::Value arg = ret->getOperand(i);
CHECK(map_info.count(arg));
map_info[arg]->setName(("output_" + std::to_string(i)).c_str());
}
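  // The outputs are named "output_<i>" here so that TrtEngineCompute can
  // later fetch them from the engine by the same names.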
for (int i = 0; i < network->getNbOutputs(); ++i) {
engine.PrepareOutputHandle(network->getOutput(i)->getName());
}
VLOG(3) << "trt engine build start.";
engine.Build(std::move(network), options);
VLOG(3) << "trt engine build done.";
// TODO(wilber): get inference options from mlir.
backends::tensorrt::InferenceOptions inference_options;
inference_options.batch = 1;
// TODO(wilber): bind trt input/output tensors.
engine.SetUpInference(inference_options, trt_bind_inputs);
return engine;
}
void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) {
engine->GetEngineInfo();
}
std::vector<phi::DenseTensor*> TrtEngineCompute(
backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) {
engine->Run(context);
std::vector<phi::DenseTensor*> res;
for (size_t i = 0; i < engine->GetOutputNum(); ++i) {
res.push_back(engine->GetOutput("output_" + std::to_string(i)));
}
return res;
}
} // namespace tensorrt
} // namespace kernel
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <tuple>
#include <utility>
#include "mlir/IR/Operation.h"
#include "paddle/infrt/backends/tensorrt/trt_engine.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace infrt {
namespace host_context {
class SymbolTable;
} // namespace host_context
namespace kernel {
namespace tensorrt {
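// Bundles the mlir "trt.create_engine" operation with the host symbol table so
// that CreateTrtEngine can look up the DenseTensor inputs fed in by the
// mlir-to-runtime translator (keyed by operand index).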
struct MlirOperationWithInfrtSymbol {
mlir::Operation* operation;
::infrt::host_context::SymbolTable* symbol_table;
};
::infrt::backends::tensorrt::TrtEngine CreateTrtEngine(
MlirOperationWithInfrtSymbol engine_op);
void PrintTrtLayer(backends::tensorrt::TrtEngine* engine);
std::vector<phi::DenseTensor*> TrtEngineCompute(
backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context);
} // namespace tensorrt
} // namespace kernel
} // namespace infrt
// RUN: infrtexec -i %s | FileCheck %s
// CHECK-LABEL: @run_trt
func @run_trt(%0 : !infrt.dense_tensor<GPU, FP32, NCHW>, %ctx : !phi.context<GPU>) {
%a = "trt.create_engine"(%0) ({
%1 = "trt.Activation"(%0) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
"infrt.return"(%1) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
}) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !trt.engine
"trt.inspect_engine"(%a) {} : (!trt.engine) -> ()
%res = "trt.compute"(%a, %ctx) {} : (!trt.engine, !phi.context<GPU>) -> (!infrt.tensor_list)
%size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32)
"infrt.print.i32"(%size) {} : (i32) -> ()
%ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
"phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
infrt.return
}
// CHECK-LABEL: @main
func @main() {
%ctx = "phi_dt.create_context.gpu" (): () -> !phi.context<GPU>
%t = "phi_dt.create_dense_tensor.gpu" (%ctx) {
precision=#infrt.precision<FP32>,
layout=#infrt.layout<NCHW>,
dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context<GPU>) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
"phi_dt.print_tensor" (%t) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
//%res =
infrt.call @run_trt(%t, %ctx) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !phi.context<GPU>) -> ()
//-> (!infrt.dense_tensor<GPU, FP32, NCHW>)
infrt.return
}
// RUN: infrtopt --canonicalize %s | FileCheck %s
// RUN: infrtopt --pd-op-fuse %s | FileCheck %s
// CHECK-LABEL: @main
func @main() -> tensor<?xf32> {
%a = "pd.feed"() {name="input0"} : () -> tensor<?xf32>
......
......@@ -3,7 +3,7 @@
// CHECK-LABEL: @sign_any_float32_execute
func @sign_any_float32_execute() {
%ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%t = "phi_dt.create_dense_tensor" (%ctx) {
%t = "phi_dt.create_dense_tensor.cpu" (%ctx) {
precision=#infrt.precision<FP32>,
layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
......
......@@ -6,7 +6,7 @@ module {
}
func @main() {
%ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
%t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%2 = infrt.call@predict(%t) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
phi_dt.print_tensor(%2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
......
// RUN: trt-exec %s
// CHECK-LABEL: @main
func @main(%bias:tensor<?xf32>, %c:tensor<?xf32>, %b1:tensor<?xf32>, %b2:tensor<?xf32>, %bias1:tensor<?xf32>, %bias2:tensor<?xf32>) -> tensor<?xf32> {
%d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
%e = "pd.relu6"(%d) {} : (tensor<?xf32>) -> tensor<?xf32>
func @main(%bias:!infrt.dense_tensor<GPU, FP32, NCHW>, %c:!infrt.dense_tensor<GPU, FP32, NCHW>, %b1:!infrt.dense_tensor<GPU, FP32, NCHW>, %b2:!infrt.dense_tensor<GPU, FP32, NCHW>, %bias1:!infrt.dense_tensor<GPU, FP32, NCHW>, %bias2:!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW> {
%d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%e = "pd.relu6"(%d) {} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
%d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
%e1 = "pd.relu"(%d1) {} : (tensor<?xf32>) -> tensor<?xf32>
%c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%e1 = "pd.relu"(%d1) {} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
%d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
%e2 = "pd.relu"(%d2) {} : (tensor<?xf32>) -> tensor<?xf32>
%c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%e2 = "pd.relu"(%d2) {} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
infrt.return %e2 : tensor<?xf32>
infrt.return %e2 : !infrt.dense_tensor<GPU, FP32, NCHW>
}
......@@ -324,7 +324,7 @@ class PADDLE_API Tensor final {
*
* @return std::shared_ptr<phi::TensorBase>
*/
std::shared_ptr<phi::TensorBase> impl() const;
const std::shared_ptr<phi::TensorBase>& impl() const;
/**
   * @brief Set the implementation of the current Tensor.
......@@ -333,6 +333,13 @@ class PADDLE_API Tensor final {
*/
void set_impl(const std::shared_ptr<phi::TensorBase>& impl);
/**
   * @brief Set the implementation of the current Tensor.
*
* @param impl
*/
void set_impl(std::shared_ptr<phi::TensorBase>&& impl);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* @brief Get the stream where the tensor is currently located
......@@ -397,7 +404,9 @@ class PADDLE_API Tensor final {
* @param blocking, Should we copy this in sync way.
* @return void
*/
void copy_(const Tensor& src, const bool blocking);
void copy_(const Tensor& src,
const phi::Place& target_place,
const bool blocking);
/**
* @brief Cast datatype from one to another
*
......
......@@ -148,4 +148,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph
cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta)
......@@ -95,12 +95,8 @@ paddle::optional<phi::MetaTensor> MakeMetaTensor(
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
if (!out->initialized()) {
auto dense_tensor = std::make_shared<phi::DenseTensor>(
phi::make_intrusive<SharedStorage>(phi::TransToPhiPlace(backend)),
phi::DenseTensorMeta());
out->set_impl(dense_tensor);
return dense_tensor.get();
if (out->impl() == nullptr) {
out->set_impl(std::make_shared<phi::DenseTensor>());
}
return static_cast<phi::DenseTensor*>(out->impl().get());
}
......@@ -111,9 +107,7 @@ std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
out->reserve(out_size);
std::vector<phi::DenseTensor*> results(out_size);
for (size_t i = 0; i < out_size; ++i) {
auto tensor_ptr = std::make_shared<phi::DenseTensor>(
phi::make_intrusive<SharedStorage>(phi::TransToPhiPlace(backend)),
phi::DenseTensorMeta());
auto tensor_ptr = std::make_shared<phi::DenseTensor>();
results[i] = tensor_ptr.get();
out->emplace_back();
out->back().set_impl(tensor_ptr);
......
......@@ -167,10 +167,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor,
if (NeedTransformPlace(
out.place(), target_args_def.backend, transform_flag)) {
phi::DenseTensor result(
phi::make_intrusive<paddle::experimental::SharedStorage>(
phi::TransToPhiPlace(target_args_def.backend)),
{out.dtype(), out.dims(), out.layout()});
phi::DenseTensor result;
framework::TransDataDevice(
out, phi::TransToPhiPlace(target_args_def.backend), &result);
out = result;
......@@ -190,14 +187,14 @@ std::shared_ptr<phi::DenseTensor> PrepareData(
tensor_in->dtype(), target_args_def.dtype, transform_flag) &&
!NeedTransformLayout(
tensor_in->layout(), target_args_def.layout, transform_flag))) {
return std::dynamic_pointer_cast<phi::DenseTensor>(tensor_in);
return std::static_pointer_cast<phi::DenseTensor>(tensor_in);
}
phi::DenseTensor out =
TransformData(*(static_cast<phi::DenseTensor*>(tensor_in.get())),
target_args_def,
transform_flag);
return std::make_shared<phi::DenseTensor>(out);
return std::make_shared<phi::DenseTensor>(std::move(out));
}
std::shared_ptr<phi::DenseTensor> PrepareData(
......
......@@ -46,6 +46,7 @@ limitations under the License. */
 * In the future, the necessary components will be moved to this library,
* or the corresponding components will be re-implemented.
*/
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
......@@ -142,7 +143,12 @@ PlaceType Tensor::place() const {
}
paddle::platform::Place Tensor::inner_place() const {
return ConvertExtPlaceToInnerPlace(place());
PADDLE_ENFORCE_NOT_NULL(
impl_,
phi::errors::PermissionDenied(
"Null pointer error, the impl_ of Tensor should not be "
"Null when calling Tensor::inner_place()."));
return impl_->place();
}
bool Tensor::is_cpu() const {
......@@ -286,12 +292,16 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const {
}
}
std::shared_ptr<phi::TensorBase> Tensor::impl() const { return impl_; }
const std::shared_ptr<phi::TensorBase> &Tensor::impl() const { return impl_; }
void Tensor::set_impl(const std::shared_ptr<phi::TensorBase> &impl) {
impl_ = impl;
}
void Tensor::set_impl(std::shared_ptr<phi::TensorBase> &&impl) {
impl_ = std::move(impl);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuStream_t Tensor::stream() const {
return platform::stream::get_current_stream(-1)->raw_stream();
......
......@@ -19,9 +19,12 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/tensor_base.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace experimental {
// declare cast api
Tensor cast(const Tensor &x, DataType out_dtype);
Tensor copy_to(const Tensor &x, Backend backend, bool blocking);
......@@ -67,12 +70,18 @@ template PADDLE_API Tensor Tensor::copy_to<phi::dtype::complex<double>>(
template PADDLE_API Tensor
Tensor::copy_to<phi::dtype::float16>(const PlaceType &target_place) const;
void Tensor::copy_(const Tensor &src, bool blocking) {
void Tensor::copy_(const Tensor &src,
const phi::Place &target_place,
bool blocking) {
if (!src.is_initialized()) {
VLOG(8) << "Src is empty, skip copy";
return;
}
// Prepare copy kernel key and outputs
auto kernel_key_set = ParseKernelKeyByInputArgs(src);
KernelType kernel_type = ParseKernelTypeByInputArgs(src);
VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name();
if (defined()) {
if (is_initialized()) {
PADDLE_ENFORCE_EQ(dtype(),
src.dtype(),
platform::errors::PreconditionNotMet(
......@@ -87,10 +96,91 @@ void Tensor::copy_(const Tensor &src, bool blocking) {
"Copy cannot be performed!",
name(),
src.name()));
PADDLE_ENFORCE_EQ(target_place,
inner_place(),
platform::errors::PreconditionNotMet(
"Place is different of dst tensor and args %s, which "
"current tensor holds %s "
"Copy cannot be performed!",
target_place.DebugString(),
inner_place().DebugString()));
kernel_key_set.backend_set =
kernel_key_set.backend_set |
BackendSet(phi::TransToPhiBackend(inner_place()));
} else {
// Deep Copy AutoGrad info from src to self.
*autograd_meta_ = *(src.autograd_meta_);
}
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
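  // Resolve the kernel backend/layout/dtype; the UNDEFINED placeholders below
  // simply fall back to the values carried by the parsed kernel key.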
Backend kernel_backend = Backend::UNDEFINED;
DataLayout kernel_layout = DataLayout::UNDEFINED;
DataType kernel_data_type = DataType::UNDEFINED;
if (kernel_backend == Backend::UNDEFINED ||
kernel_layout == DataLayout::UNDEFINED ||
kernel_data_type == DataType::UNDEFINED) {
if (kernel_backend == Backend::UNDEFINED) {
kernel_backend = kernel_key.backend();
}
if (kernel_layout == DataLayout::UNDEFINED) {
kernel_layout = kernel_key.layout();
}
if (kernel_data_type == DataType::UNDEFINED) {
kernel_data_type = kernel_key.dtype();
}
}
if (kernel_type == KernelType::DENSE_TENSOR_KENREL) {
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", {kernel_backend, kernel_layout, kernel_data_type});
VLOG(6) << "copy API kernel key: " << kernel_key;
VLOG(6) << "copy API kernel: " << kernel;
using kernel_signature = void (*)(const platform::DeviceContext &,
const phi::DenseTensor &,
phi::Place,
bool,
phi::DenseTensor *);
SetKernelOutput(kernel_backend, this);
phi::MetaTensor meta_out(impl_.get());
phi::UnchangedInferMeta(
MakeMetaTensor(
*(std::static_pointer_cast<phi::DenseTensor>(src.impl_))),
&meta_out);
auto *kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx,
(*(std::static_pointer_cast<phi::DenseTensor>(src.impl_))),
target_place,
blocking,
static_cast<phi::DenseTensor *>(impl_.get()));
} else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) {
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy_sr", {kernel_backend, kernel_layout, kernel_data_type});
VLOG(6) << "copy API kernel key: " << kernel_key;
VLOG(6) << "copy API kernel: " << kernel;
using kernel_signature = void (*)(const platform::DeviceContext &,
const phi::SelectedRows &,
phi::Place,
bool,
phi::SelectedRows *);
SetSelectedRowsKernelOutput(kernel_backend, this);
phi::MetaTensor meta_out(impl_.get());
phi::UnchangedInferMeta(
MakeMetaTensor(
*(std::static_pointer_cast<phi::SelectedRows>(src.impl_))),
&meta_out);
auto *kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx,
(*(std::static_pointer_cast<phi::SelectedRows>(src.impl_))),
target_place,
blocking,
static_cast<phi::SelectedRows *>(impl_.get()));
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"We currently only support dense tensor copy for now and if u need to "
"copy selected rows please raise a issue."));
}
auto copy_tensor =
src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking);
set_impl(copy_tensor.impl());
}
} // namespace experimental
......
......@@ -741,6 +741,10 @@ struct GPUContext::Impl {
GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
GPUContext::GPUContext(GPUContext&&) = default;
GPUContext& GPUContext::operator=(GPUContext&&) = default;
GPUContext::GPUContext(const GPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
......
......@@ -77,6 +77,8 @@ class DnnWorkspaceHandle {
class GPUContext : public DeviceContext {
public:
GPUContext();
GPUContext(GPUContext&&);
GPUContext& operator=(GPUContext&&);
explicit GPUContext(const GPUPlace& place);
......
cc_library(phi_place SRCS place.cc)
cc_library(scalar SRCS scalar.cc)
cc_library(scalar SRCS scalar.cc DEPS phi_enforce)
......@@ -197,8 +197,16 @@ class Kernel {
const KernelArgsDef& args_def() const { return args_def_; }
const TensorArgDef& InputAt(size_t idx) const {
return args_def_.input_defs().at(idx);
}
TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); }
const TensorArgDef& OutputAt(size_t idx) const {
return args_def_.output_defs().at(idx);
}
TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); }
bool IsValid() { return fn_ != nullptr; }
......
......@@ -571,6 +571,48 @@ void GatherTreeMeta(const MetaTensor& ids,
out->set_dims(ids_dims);
}
void GridSampleBaseInferMeta(const MetaTensor& x,
const MetaTensor& grid,
MetaTensor* out,
MetaConfig config) {
auto x_dims = x.dims();
auto grid_dims = grid.dims();
PADDLE_ENFORCE_EQ(x_dims.size(),
4,
phi::errors::InvalidArgument(
"Input(X) of GridSampleOp should be 4-D Tensor, but "
"received X dimension size(%d)",
x_dims.size()));
PADDLE_ENFORCE_EQ(grid_dims.size(),
4,
phi::errors::InvalidArgument(
"Input(Grid) of GridSampleOp should be 4-D Tensor, "
"but received X dimension size(%d)",
grid_dims.size()));
if (config.is_runtime || grid_dims[3] > 0) {
PADDLE_ENFORCE_EQ(
grid_dims[3],
2,
phi::errors::InvalidArgument(
"Input(Grid) dimension[3] should be 2, but received %d",
grid_dims[3]));
}
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(
grid_dims[0],
x_dims[0],
phi::errors::InvalidArgument(
"Input(X) and Input(Grid) dimension[0] should be equal, but "
"received X dimension[0](%d) != Grid dimension[0](%d)",
x_dims[0],
grid_dims[0]));
}
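  // Output layout follows the NCHW input: [N, C, H_out, W_out], where H_out
  // and W_out come from the grid's dimension[1] and dimension[2].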
out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
out->set_dtype(x.dtype());
out->share_lod(x);
}
void HuberLossInferMeta(const MetaTensor& input,
const MetaTensor& label,
float delta,
......
......@@ -103,6 +103,11 @@ void GatherTreeMeta(const MetaTensor& ids,
const MetaTensor& parents,
MetaTensor* out);
void GridSampleBaseInferMeta(const MetaTensor& x,
const MetaTensor& grid,
MetaTensor* out,
MetaConfig config = MetaConfig());
void HuberLossInferMeta(const MetaTensor& input_meta,
const MetaTensor& label_meta,
float delta,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/selected_rows/copy_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace phi {
namespace sr {
template <typename Context>
void Copy(const Context& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst) {
if (src.value().Holder() != dst->value().Holder() ||
src.value().data() != dst->value().data()) {
dst->set_rows(src.rows());
dst->set_height(src.height());
}
phi::Copy<Context>(
dev_ctx, src.value(), dst_place, blocking, dst->mutable_value());
}
} // namespace sr
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(
copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy<phi::CPUContext>, ALL_DTYPE) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(
copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy<phi::GPUContext>, ALL_DTYPE) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace phi {
namespace sr {
template <typename Context>
void Copy(const Context& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
} // namespace sr
} // namespace phi
......@@ -565,16 +565,25 @@ def grad(outputs,
if isinstance(in_out_list, (list, tuple)):
assert len(in_out_list) > 0, "{} cannot be empty".format(name)
for each_var in in_out_list:
if core._in_eager_mode():
assert isinstance(
each_var, core.eager.
Tensor), "Elements of {} must be Tensor".format(name)
else:
assert isinstance(
each_var,
core.VarBase), "Elements of {} must be Variable".format(
name)
return in_out_list
else:
if core._in_eager_mode():
assert isinstance(
in_out_list,
core.VarBase), "{} must be Variable or list of Variable".format(
name)
in_out_list, core.eager.
Tensor), "{} must be Tensor or list of Tensor".format(name)
else:
assert isinstance(
in_out_list, core.VarBase
), "{} must be Variable or list of Variable".format(name)
return [in_out_list]
outputs = check_in_out(outputs, 'outputs')
......@@ -586,6 +595,11 @@ def grad(outputs,
for each_var in grad_outputs:
if each_var is not None:
if core._in_eager_mode():
assert isinstance(
each_var, core.eager.Tensor
), "grad_outputs must be None, a Variable or a list containing None or Variables"
else:
assert isinstance(
each_var, core.VarBase
), "grad_outputs must be None, a Variable or a list containing None or Variables"
......@@ -600,14 +614,27 @@ def grad(outputs,
no_grad_vars = []
elif isinstance(no_grad_vars, core.VarBase):
no_grad_vars = [no_grad_vars]
elif isinstance(no_grad_vars, core.eager.Tensor):
no_grad_vars = [no_grad_vars]
elif isinstance(no_grad_vars, (list, tuple, set)):
no_grad_vars = list(no_grad_vars)
for var in no_grad_vars:
if core._in_eager_mode():
assert isinstance(
var,
core.eager.Tensor), "no_grad_vars can only contains Tensor"
else:
assert isinstance(
var, core.VarBase), "no_grad_vars can only contains Variable"
var,
core.VarBase), "no_grad_vars can only contains Variable"
else:
if core._in_eager_mode():
raise AssertionError(
"no_grad_vars must be None, Variable or list/tuple/set of Variables")
"no_grad_vars must be None, Tensor or list/tuple/set of Tensors")
else:
raise AssertionError(
"no_grad_vars must be None, Variable or list/tuple/set of Variables"
)
assert isinstance(create_graph, bool), "create_graph must be True or False"
......@@ -622,6 +649,11 @@ def grad(outputs,
assert isinstance(only_inputs, bool), "only_inputs must be True or False"
assert only_inputs, "only_inputs=False is not supported yet"
if core._in_eager_mode():
return core.eager.run_partial_grad(
outputs, inputs, grad_outputs, retain_graph, create_graph,
only_inputs, allow_unused, no_grad_vars)
place = core.Place()
place.set_place(framework._current_expected_place())
return core.dygraph_partial_grad(inputs, outputs, grad_outputs,
......
......@@ -30,6 +30,7 @@ from paddle.fluid.layers import nn
from paddle.fluid.layers.utils import _hash_with_id
from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.fluid.framework import in_dygraph_mode
from paddle import _C_ops
__all__ = ['TranslatedLayer']
......@@ -761,6 +762,21 @@ def _construct_params_and_buffers(model_path,
return var_dict
def _valid_vars(vars):
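    # When the list is empty, fall back to a single fake variable (mirroring
    # the Fake_var handling removed below), presumably so _C_ops.run_program
    # always receives non-empty variable lists.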
if vars:
return vars
if framework._in_eager_mode():
return [
core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var",
core.VarDesc.VarType.RAW, False)
]
else:
return [
core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var",
core.VarDesc.VarType.RAW, False)
]
def _run_dygraph(instance, input, program_holder):
# 1. prepare inputs, outputs, attrs
......@@ -826,12 +842,7 @@ def _run_dygraph(instance, input, program_holder):
# hold forward variables
if framework._in_eager_mode():
tmp_scope_vec = core.eager.Tensor(
dtype=core.VarDesc.VarType.FP32,
dims=[],
name="program_out_scope",
type=core.VarDesc.VarType.STEP_SCOPES,
persistable=True)
tmp_scope_vec = [program_holder.scope]
else:
tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
"program_out_scope",
......@@ -852,41 +863,18 @@ def _run_dygraph(instance, input, program_holder):
var_desc.shape(),
var_desc.name(), var_desc.type(), False)
double_grad_vars.append(var)
if len(double_grad_vars) == 0:
if framework._in_eager_mode():
double_grad_vars = [
core.eager.Tensor(
value=[1],
name='Fake_var',
place=framework._current_expected_place())
]
else:
double_grad_vars = [
core.VarBase(
value=[1],
name='Fake_var',
place=framework._current_expected_place())
]
# 2. run program by op
trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program
end_op_index = program_holder.infer_program.block(0).op_size()
framework._dygraph_tracer().trace_op(
type='run_program',
inputs={'X': input_vars,
'Params': persistable_vars},
outputs={
'Out': output_vars,
'OutScope': tmp_scope_vec,
'DOut': double_grad_vars
},
attrs={
'global_block': trace_program.block(0),
'start_op_index': 0,
'end_op_index': end_op_index,
'is_test': instance._is_test,
'program_id': _hash_with_id(trace_program, instance)
})
attrs = ('global_block', trace_program.block(0), 'start_op_index', 0,
'end_op_index', end_op_index, 'is_test', instance._is_test,
'program_id', _hash_with_id(trace_program, instance))
_C_ops.run_program(
_valid_vars(input_vars),
_valid_vars(persistable_vars),
_valid_vars(output_vars), tmp_scope_vec,
_valid_vars(double_grad_vars), *attrs)
# NOTE: [ why need set param's gradient type here ]
# if user set sparse gradient mode, the param's gradient
# will be SelectedRows, not LoDTensor. But tracer will just
......@@ -914,8 +902,10 @@ def _run_dygraph(instance, input, program_holder):
def drop_scope_if_no_grad(instance, scope_vec):
tracer = framework._dygraph_tracer()
scope = scope_vec.value().get_scope() if isinstance(scope_vec, (
core.VarBase)) else scope_vec[0]
if (not instance._is_test) and (not tracer._has_grad):
scope_vec.value().get_scope().drop_kids()
scope.drop_kids()
def _run_static_graph(input, program_holder, trace_program):
......
......@@ -821,7 +821,7 @@ def save(layer, path, input_spec=None, **configs):
for var in flatten(input_spec):
if isinstance(var, paddle.static.InputSpec):
inner_input_spec.append(var)
elif isinstance(var, (core.VarBase, Variable)):
elif isinstance(var, (core.VarBase, core.eager.Tensor, Variable)):
inner_input_spec.append(
paddle.static.InputSpec.from_tensor(var))
else:
......
......@@ -760,7 +760,8 @@ class Layer(object):
raise KeyError("The name of buffer can not be empty.")
elif hasattr(self, name) and name not in self._buffers:
raise KeyError("attribute '{}' already exists.".format(name))
elif tensor is not None and not type(tensor) == core.VarBase:
elif tensor is not None and not (type(tensor) == core.VarBase or
type(tensor) == core.eager.Tensor):
raise TypeError(
"The registered buffer should be a core.VarBase, but received {}.".
format(type(tensor).__name__))
......
......@@ -1118,9 +1118,9 @@ set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_split_program PROPERTIES TIMEOUT 120)
if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120)
set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120)
set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 150)
set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200)
set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120)
set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150)
set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30)
set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200)
set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
......
......@@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
from paddle.fluid.framework import _test_eager_guard
from predictor_utils import PredictorTools
......@@ -155,6 +156,13 @@ class TestMNISTWithToStatic(TestMNIST):
np.allclose(dygraph_loss, static_loss),
msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
static_loss))
with _test_eager_guard():
dygraph_loss = self.train_dygraph()
static_loss = self.train_static()
self.assertTrue(
np.allclose(dygraph_loss, static_loss),
msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
static_loss))
def test_mnist_declarative_cpu_vs_mkldnn(self):
dygraph_loss_cpu = self.train_dygraph()
......
......@@ -19,6 +19,7 @@ import unittest
import os
import numpy as np
import random
import socket
import paddle
import paddle.nn as nn
......@@ -31,11 +32,24 @@ from paddle.optimizer import SGD
from paddle.fluid.initializer import NumpyArrayInitializer
def net_is_used(port, ip='127.0.0.1'):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
s.connect((ip, port))
s.shutdown(2)
return True
except Exception as e:
return False
def init_process_group(strategy=None):
nranks = ParallelEnv().nranks
rank = ParallelEnv().local_rank
is_master = True if rank == 0 else False
store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks)
for port in range(20000, 21000):
if not net_is_used(port):
store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master,
nranks)
group = core.ProcessGroupNCCL(store, rank, nranks)
return group
......
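The new net_is_used helper treats a successful TCP connect as "port already taken", and init_process_group then walks the 20000-21000 range until it finds a free port for the TCPStore instead of hard-coding 6172. A self-contained version of that probe, standard library only (the port range is the one from the diff; the rest is a sketch):

import socket

def net_is_used(port, ip='127.0.0.1'):
    # A successful connect means some process is already listening there.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((ip, port))
        s.shutdown(socket.SHUT_RDWR)
        return True
    except OSError:
        return False
    finally:
        s.close()

free_port = next(p for p in range(20000, 21000) if not net_is_used(p))
print(free_port)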
......@@ -52,7 +52,7 @@ class EagerScaleTestCase(unittest.TestCase):
out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True)
self.assertIsNone(data_eager.grad)
out_eager.backward(grad_eager, False)
self.assertTrue(data_eager.grad._is_initialized())
self.assertIsNotNone(data_eager.grad)
self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data))
def test_retain_grad_and_run_backward_raises(self):
......
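For the scale kernel used above, out = scale * x + bias, so its backward pass just multiplies the incoming gradient by scale; with scale = 1.0 the tensor passed to backward comes out unchanged, which is what the array_equal check relies on. The construction of grad_eager is not shown in this hunk, so the snippet below only restates that arithmetic:

import numpy as np

scale, bias = 1.0, 0.9
x = np.random.rand(4, 4).astype('float32')
grad_out = np.random.rand(4, 4).astype('float32')
grad_x = scale * grad_out          # d(scale * x + bias) / dx == scale
assert np.allclose(grad_x, grad_out)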
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -19,6 +19,9 @@ from paddle.vision.models import resnet50, resnet101
import unittest
from unittest import TestCase
import numpy as np
import paddle.compat as cpt
from paddle.fluid.framework import _test_eager_guard
import paddle.fluid.core as core
def _dygraph_guard_(func):
......@@ -40,6 +43,80 @@ def random_var(size, low=-1, high=1, dtype='float32'):
return fluid.dygraph.to_variable(x_np)
class TestEagerGrad(TestCase):
def func_simple_example_eager_grad(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
out = paddle.matmul(x, y)
dx = fluid.dygraph.grad(out, x)
dout = np.ones_like(np_y)
expected_dx = np.matmul(dout, np.transpose(np_y))
# stop_gradient = !create_graph, create_graph default false
self.assertEqual(dx[0].stop_gradient, True)
self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0]))
def test_simple_example_eager_grad(self):
with _test_eager_guard():
self.func_simple_example_eager_grad()
self.func_simple_example_eager_grad()
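The expected gradient in func_simple_example_eager_grad follows from the matmul backward rule: for out = matmul(x, y) with an all-ones output gradient (the default when grad_outputs is not passed), d out / d x = dout · yᵀ. A quick NumPy check with the same shapes as the test:

import numpy as np

np.random.seed(2021)
x = np.random.random((3, 3))
y = np.random.random((3, 1))
dout = np.ones_like(y)             # default output gradient
dx = np.matmul(dout, y.T)          # shape (3, 3); every row equals y.T
print(dx.shape)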
def func_simple_example_eager_grad_allow_unused(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
np_z = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
out_z = paddle.nn.functional.sigmoid(z)
out = paddle.matmul(x, y)
dx = fluid.dygraph.grad(out, [x, z], allow_unused=True)
dout = np.ones_like(np_y)
expected_dx = np.matmul(dout, np.transpose(np_y))
self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0]))
# stop_gradient = !create_graph, create_graph default false
self.assertEqual(dx[0].stop_gradient, True)
# x is unused input in the graph
self.assertEqual(dx[1], None)
def test_simple_example_eager_grad_allow_unused(self):
with _test_eager_guard():
self.func_simple_example_eager_grad_allow_unused()
self.func_simple_example_eager_grad_allow_unused()
def func_simple_example_eager_grad_not_allow_unused(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
np_z = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
out_z = paddle.nn.functional.sigmoid(z)
out = paddle.matmul(x, y)
try:
# allow_unused is false in default
dx = fluid.dygraph.grad(out, [x, z])
except ValueError as e:
error_msg = cpt.get_exception_message(e)
assert error_msg.find("allow_unused") > 0
def test_simple_example_eager_grad_not_allow_unused(self):
with _test_eager_guard():
self.func_simple_example_eager_grad_not_allow_unused()
self.func_simple_example_eager_grad_not_allow_unused()
class TestDygraphDoubleGrad(TestCase):
def setUp(self):
self.sort_sum_gradient = False
......@@ -64,7 +141,7 @@ class TestDygraphDoubleGrad(TestCase):
allow_unused=allow_unused)
@dygraph_guard
def test_exception(self):
def func_exception(self):
with self.assertRaises(AssertionError):
self.grad(None, None)
......@@ -93,8 +170,13 @@ class TestDygraphDoubleGrad(TestCase):
with self.assertRaises(AssertionError):
self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
def test_exception(self):
with _test_eager_guard():
self.func_exception()
self.func_exception()
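The renaming of test_exception to func_exception plus the new test_exception wrapper is the pattern applied throughout the rest of this file: the test body moves into a func_* method and the unittest entry point runs it twice, once inside _test_eager_guard() and once without, so both the eager and the legacy dygraph engines are exercised. Reduced to a skeleton (class and method names are placeholders):

from unittest import TestCase
from paddle.fluid.framework import _test_eager_guard

class ExampleTest(TestCase):
    def func_case(self):
        # the actual assertions live here
        pass

    def test_case(self):
        with _test_eager_guard():  # first pass: eager mode
            self.func_case()
        self.func_case()           # second pass: legacy dygraph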
@dygraph_guard
def test_simple_example(self):
def func_simple_example(self):
x = random_var(self.shape)
x.stop_gradient = False
y = x + 1
......@@ -123,8 +205,44 @@ class TestDygraphDoubleGrad(TestCase):
self.assertNotEqual(grad_with_none_and_not_none.stop_gradient,
create_graph)
def test_simple_example(self):
with _test_eager_guard():
self.func_simple_example()
self.func_simple_example()
@dygraph_guard
def test_none_one_initial_gradient(self):
def func_example_no_grad_vars(self):
x = random_var(self.shape)
x_np = x.numpy()
numel = x_np.size
x.stop_gradient = False
y1 = fluid.layers.relu(x)
y2 = fluid.layers.relu(x)
z = y1 + y2
w = z * z
w_mean = fluid.layers.reduce_mean(w)
del y1, z, w
dx_actual, = self.grad(
[w_mean], [x], create_graph=True, no_grad_vars=[y2])
self.assertFalse(y2.stop_gradient)
self.assertFalse(dx_actual.stop_gradient)
dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) *
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
def test_example_no_grad_vars(self):
with _test_eager_guard():
self.func_example_no_grad_vars()
self.func_example_no_grad_vars()
@dygraph_guard
def func_none_one_initial_gradient(self):
numel = 1
for s in self.shape:
numel *= s
......@@ -190,8 +308,13 @@ class TestDygraphDoubleGrad(TestCase):
np.array_equal(grad_z.numpy(),
original_random_grad_z))
def test_none_one_initial_gradient(self):
with _test_eager_guard():
self.func_none_one_initial_gradient()
self.func_none_one_initial_gradient()
@dygraph_guard
def test_example_with_gradient_accumulation_and_create_graph(self):
def func_example_with_gradient_accumulation_and_create_graph(self):
x = random_var(self.shape)
x_np = x.numpy()
numel = x_np.size
......@@ -214,12 +337,15 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward(retain_graph=True)
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) *
(x_np + dx_expected *
x_grad_expected = (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
......@@ -231,8 +357,13 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_create_graph()
self.func_example_with_gradient_accumulation_and_create_graph()
@dygraph_guard
def test_example_with_gradient_accumulation_and_no_grad_vars(self):
def func_example_with_gradient_accumulation_and_no_grad_vars(self):
x = random_var(self.shape)
x_np = x.numpy()
numel = x_np.size
......@@ -256,17 +387,25 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) *
(x_np + dx_expected *
x_grad_expected = (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 4 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_no_grad_vars(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_no_grad_vars()
self.func_example_with_gradient_accumulation_and_no_grad_vars()
@dygraph_guard
def test_example_with_gradient_accumulation_and_not_create_graph(self):
def func_example_with_gradient_accumulation_and_not_create_graph(self):
x = random_var(self.shape)
x_np = x.numpy()
numel = x_np.size
......@@ -289,6 +428,9 @@ class TestDygraphDoubleGrad(TestCase):
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
......@@ -296,6 +438,11 @@ class TestDygraphDoubleGrad(TestCase):
x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_not_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_not_create_graph()
self.func_example_with_gradient_accumulation_and_not_create_graph()
class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
def setUp(self):
......@@ -304,7 +451,7 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
class TestDygraphDoubleGradVisitedUniq(TestCase):
def test_compare(self):
def func_compare(self):
value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2,
5).astype("float32")
......@@ -349,6 +496,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
self.assertTrue(np.array_equal(grad_1, grad_2))
def test_compare(self):
with _test_eager_guard():
self.func_compare()
self.func_compare()
class TestRaiseNoDoubleGradOp(TestCase):
def raise_no_grad_op(self):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -18,6 +18,8 @@ import unittest
from unittest import TestCase
import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard
import paddle.fluid.core as core
def _dygraph_guard_(func):
......@@ -62,7 +64,7 @@ class TestDygraphDoubleGrad(TestCase):
allow_unused=allow_unused)
@dygraph_guard
def test_exception(self):
def func_exception(self):
with self.assertRaises(AssertionError):
self.grad(None, None)
......@@ -91,8 +93,13 @@ class TestDygraphDoubleGrad(TestCase):
with self.assertRaises(AssertionError):
self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
def test_exception(self):
with _test_eager_guard():
self.func_exception()
self.func_exception()
@dygraph_guard
def test_simple_example(self):
def func_simple_example(self):
x = random_var(self.shape)
x.stop_gradient = False
y = x + 1
......@@ -121,8 +128,13 @@ class TestDygraphDoubleGrad(TestCase):
self.assertNotEqual(grad_with_none_and_not_none.stop_gradient,
create_graph)
def test_simple_example(self):
with _test_eager_guard():
self.func_simple_example()
self.func_simple_example()
@dygraph_guard
def test_none_one_initial_gradient(self):
def func_none_one_initial_gradient(self):
numel = 1
for s in self.shape:
numel *= s
......@@ -188,8 +200,13 @@ class TestDygraphDoubleGrad(TestCase):
np.array_equal(grad_z.numpy(),
original_random_grad_z))
def test_none_one_initial_gradient(self):
with _test_eager_guard():
self.func_none_one_initial_gradient()
self.func_none_one_initial_gradient()
@dygraph_guard
def test_example_with_gradient_accumulation_and_create_graph(self):
def func_example_with_gradient_accumulation_and_create_graph(self):
x = random_var(self.shape)
x_np = x.numpy()
numel = x_np.size
......@@ -212,17 +229,25 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) *
(x_np + dx_expected *
x_grad_expected = (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_create_graph()
self.func_example_with_gradient_accumulation_and_create_graph()
@dygraph_guard
def test_example_with_gradient_accumulation_and_no_grad_vars(self):
def func_example_with_gradient_accumulation_and_no_grad_vars(self):
x = random_var(self.shape)
x_np = x.numpy()
numel = x_np.size
......@@ -246,17 +271,25 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) *
(x_np + dx_expected *
x_grad_expected = (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 4 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_no_grad_vars(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_no_grad_vars()
self.func_example_with_gradient_accumulation_and_no_grad_vars()
@dygraph_guard
def test_example_with_gradient_accumulation_and_not_create_graph(self):
def func_example_with_gradient_accumulation_and_not_create_graph(self):
x = random_var(self.shape)
x_np = x.numpy()
numel = x_np.size
......@@ -279,6 +312,9 @@ class TestDygraphDoubleGrad(TestCase):
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
......@@ -286,6 +322,11 @@ class TestDygraphDoubleGrad(TestCase):
x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_not_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_not_create_graph()
self.func_example_with_gradient_accumulation_and_not_create_graph()
class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
def setUp(self):
......
......@@ -193,7 +193,7 @@ class InputSpec(object):
print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x)
"""
if isinstance(tensor, (Variable, core.VarBase)):
if isinstance(tensor, (Variable, core.VarBase, core.eager.Tensor)):
return cls(tensor.shape, tensor.dtype, name or tensor.name)
else:
raise ValueError(
......
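With core.eager.Tensor added to the isinstance check, InputSpec.from_tensor can now be built directly from an eager tensor as well. Illustrative usage (the shape and name are arbitrary; the printed form follows the docstring shown above):

import paddle
from paddle.static import InputSpec

t = paddle.ones([2, 2], dtype='float32')
x_spec = InputSpec.from_tensor(t, name='x')
print(x_spec)   # e.g. InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x)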
......@@ -698,7 +698,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self
self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag)
api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '')
return f"""
{code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
{code_indent} const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
{code_indent} "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
{code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
{code_indent} VLOG(6) << "{self.api} API kernel: " << kernel;
......
......@@ -23,16 +23,6 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> {
let arguments = (ins PD_Tensor :$inputs, StrAttr:$name);
}
def PD_ReturnOp : PD_Op<"return", [Terminator]> {
let summary = "return Op";
let description = [{
Fetch tensor from the graph.
}];
let arguments = (ins Variadic<PD_Tensor>:$inputs);
}
def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> {
let summary = "paddle graph Op";
let description = [{
......
......@@ -16,8 +16,6 @@ import paddle.fluid.framework as framework
from paddle.fluid import core
from paddle import compat as cpt
ops_having_canonicalization = {"elementwise_add", }
# collect original ops: op which has both inference and grad definition
def get_original_ops():
......@@ -186,7 +184,7 @@ def generate_all_ops_inputs_outputs_map(op_descs):
cpp_style_ops_outputs_map_str = start_ + ops_outputs_str + "\n};"
# 3. Write to header file
dst_head_file = "../../paddle/infrt/dialect/pd_ops_info.h"
dst_head_file = "../../paddle/infrt/dialect/pd/common/pd_ops_info.h"
with open(dst_head_file, 'w') as ops_inputs_outputs_head_file:
ops_inputs_outputs_head_file.write(cpp_style_ops_inputs_map_str)
ops_inputs_outputs_head_file.write("\n\n")
......@@ -195,7 +193,7 @@ def generate_all_ops_inputs_outputs_map(op_descs):
# function to generate paddle op dialect file
def convert_op_proto_into_mlir(op_descs):
dst_dialect_file = "../../paddle/infrt/dialect/pd_ops.td"
dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td"
custom_dialect_file = "custom_pdop.td"
# 1. Head files
......@@ -214,7 +212,7 @@ def convert_op_proto_into_mlir(op_descs):
"include \"mlir/Interfaces/InferTypeOpInterface.td\"",
"include \"mlir/Interfaces/LoopLikeInterface.td\"",
"include \"mlir/IR/OpBase.td\"",
"include \"paddle/infrt/dialect/pd_op_base.td\"",
"include \"paddle/infrt/dialect/pd/ir/pd_op_base.td\"",
"",
]
......@@ -245,7 +243,6 @@ def convert_op_proto_into_mlir(op_descs):
op_type=op_type,
left_brace="{")
SUMMARY = ' let summary = "{} op";\n'.format(op_type)
CANONICALIZATION = "let hasCanonicalizer = 1;" if op_type in ops_having_canonicalization else ""
# 2.2 Description
contents = ""
......@@ -348,7 +345,6 @@ def convert_op_proto_into_mlir(op_descs):
ops_mlir_file.write(DESCRIPTION)
ops_mlir_file.write(ARGUMENTS)
ops_mlir_file.write(RESULTS)
ops_mlir_file.write(CANONICALIZATION)
ops_mlir_file.write("}\n")
print("Skipped ops num: " + str(len(skipped_op_list)))
......