Commit a4586d17, authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -29,132 +29,330 @@
namespace egr {

/*
 * GeneralGrad is a helper class to implement custom grad operation between
 * outputs and inputs.
 */
class GeneralGrad {
 public:
  static GeneralGrad& Instance() { return *general_grad_; }

  // Get inputs' / no_grad_vars' GradNodes and InputMeta info
  void GetTargetNodesInfo(
      const std::vector<paddle::experimental::Tensor>& inputs,
      bool is_no_grad_vars) {
    std::string msg = is_no_grad_vars ? "no_grad_vars" : "inputs";
    VLOG(6) << "Running in GetTargetNodesInfo.";
    if (!inputs.empty()) {
      VLOG(6) << msg << " are not empty.";
      size_t num_inputs = inputs.size();
      for (size_t i = 0; i < num_inputs; i++) {
        AutogradMeta* auto_grad_meta =
            EagerUtils::unsafe_autograd_meta(inputs[i]);
        auto target_node = auto_grad_meta->GetMutableGradNode().get();
        PADDLE_ENFORCE_NOT_NULL(target_node,
                                paddle::platform::errors::Fatal(
                                    "There is no grad op for %s:[%d] or its "
                                    "stop_gradient=True.",
                                    msg, i));
        if (is_no_grad_vars) {
          (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta;
        } else {  // normal input
          (input_target_nodes_inputmeta_map)[target_node] = auto_grad_meta;
        }
      }
    }
  }

  // Purify potential_startup_nodes: remove nodes that are the same as
  // input_target_nodes
  void PurifyPotentialStartUpNodes() {
    VLOG(6) << "Running in PurifyPotentialStartUpNodes";
    if (input_target_nodes_inputmeta_map.empty()) return;
    std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
    for (auto startup_op : potential_startup_nodes) {
      auto iter = input_target_nodes_inputmeta_map.find(startup_op);
      if (iter != input_target_nodes_inputmeta_map.end()) {
        potential_startup_nodes_to_be_erased.emplace(iter->first);
      }
    }
    if (!potential_startup_nodes_to_be_erased.empty()) {
      for (auto nodes : potential_startup_nodes_to_be_erased) {
        potential_startup_nodes.erase(nodes);
      }
    }
  }

  // Remove nodes that don't need to be stored in
  // potential_stop_nodes / potential_startup_nodes
  void UpdateGraphInfo() {
    // Update potential_stop_nodes by depending_nodes to
    // make sure the path from root to target_node is ok
    std::unordered_set<GradNodeBase*> _startup_ops;
    VLOG(6) << "Running in UpdateGraphInfo";
    std::queue<GradNodeBase*> queue;
    for (auto& target_nodes_inputmeta_pair : input_target_nodes_inputmeta_map) {
      queue.emplace(target_nodes_inputmeta_pair.first);
    }

    while (!queue.empty()) {
      auto* target_node = queue.front();
      queue.pop();
      if (!(depending_nodes)[target_node].empty()) {
        auto precedding_nodes = (depending_nodes)[target_node];
        for (auto pre_nodes : precedding_nodes) {
          queue.emplace(pre_nodes);
          if (potential_stop_nodes.find(pre_nodes) !=
              potential_stop_nodes.end()) {
            potential_stop_nodes.erase(pre_nodes);
          }
        }
      } else {  // startup_ops have no preceding nodes
        VLOG(6) << "Emplace _startup_ops";
        _startup_ops.emplace(target_node);
      }
    }

    // Purify potential_startup_nodes again: remove potential startup nodes
    // that cannot reach the input target nodes
    if (!_startup_ops.empty()) {
      std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
      for (auto node : potential_startup_nodes) {
        if (_startup_ops.count(node) == 0) {
          VLOG(6) << "Set up potential_startup_nodes_to_be_erased";
          potential_startup_nodes_to_be_erased.emplace(node);
        }
      }
      if (!potential_startup_nodes_to_be_erased.empty()) {
        for (auto node : potential_startup_nodes_to_be_erased) {
          VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased";
          potential_startup_nodes.erase(node);
        }
      }
    }
  }

  // Get graph info between the input target GradNodes and the outputs;
  // record depending_nodes, potential_stop_nodes and potential_startup_nodes
  void GetGraphInfoBetweenTargets(const std::queue<GradNodeBase*>& init_queue) {
    VLOG(6) << "Running in GetGraphInfoBetweenTargets";

    // Calculate in_degree for each node
    std::unordered_map<GradNodeBase*, int> node_in_degree_map;

    // Copy nodes
    std::queue<GradNodeBase*> queue = init_queue;
    std::unordered_set<GradNodeBase*> visited;

    // Visit each node exactly once in any order
    while (!queue.empty()) {
      GradNodeBase* node = queue.front();
      queue.pop();

      if (visited.count(node)) {
        continue;
      }
      visited.insert(node);

      // Check whether the node is one of the input target nodes; if it is,
      // every next_node behind it becomes a potential stop node
      bool is_potential_stop_nodes =
          input_target_nodes_inputmeta_map.count(node);

      // Find and append next nodes
      const std::vector<std::vector<Edge>>& edges = node->GetEdges();
      for (const auto& edge_list : edges) {
        for (const Edge& edge : edge_list) {
          GradNodeBase* next_node = edge.GetMutableGradNode().get();

          // Next node could be nullptr if it is leaf tensor with no
          // AccumulationNode attached.
          // Or it could also originate from dispensable inputs.
          if (!next_node) continue;

          // If the current node is an input target node, all of its
          // next_nodes are inserted into potential_stop_nodes
          if (is_potential_stop_nodes) {
            potential_stop_nodes.emplace(next_node);
          }

          // Update in_degree
          if (!node_in_degree_map.count(next_node))
            node_in_degree_map[next_node] = 0;
          node_in_degree_map[next_node]++;

          // Record depending relationship
          (depending_nodes)[next_node].emplace(node);
          queue.push(next_node);
        }
      }
    }
    // Update graph info: remove some nodes from
    // potential_stop_nodes / potential_startup_nodes
    UpdateGraphInfo();
  }

  void ModifyReadyQueue(std::queue<GradNodeBase*>* queue) {
    std::queue<GradNodeBase*> tmp_queue;
    for (auto nodes : potential_startup_nodes) {
      tmp_queue.emplace(nodes);
    }
    tmp_queue.swap(*queue);
  }

  // Set result for input target grad_var when potential_startup_nodes is empty
  void SetResultForInputTargetVar(
      const std::unordered_map<GradNodeBase*,
                               std::unique_ptr<GradTensorHolder>>&
          node_input_buffers_dict) {
    if (potential_startup_nodes.size() == 0) {
      for (auto input_target_node : *GetInPutTargetNodesInputMetaMap()) {
        // out rank_info of forward op
        auto rank_info = input_target_node.second->OutRankInfo();
        auto iter = node_input_buffers_dict.find(input_target_node.first);
        if (iter != node_input_buffers_dict.end()) {
          auto& target_result =
              (iter->second)->Buffers()[rank_info.first][rank_info.second];
          // save the target result
          results_map[input_target_node.first] = target_result;
        }
      }
    }
  }

  // Set input target grad_var from node_input_buffer by inputmeta
  void SetResultForInputTargetVar(GradTensorHolder input_buffers,
                                  GradNodeBase* node) {
    auto iter = GetInPutTargetNodesInputMetaMap()->find(node);
    if (iter != GetInPutTargetNodesInputMetaMap()->end()) {
      VLOG(6) << "Get target result by inputmeta";
      // out rank_info of forward op
      auto rank_info = (iter->second)->OutRankInfo();
      // rank_info is a pair: first is the slot_id, second is the rank.
      auto& target_result =
          input_buffers.Buffers()[rank_info.first][rank_info.second];
      // save the target result
      results_map[node] = target_result;
    }
  }

  std::vector<paddle::experimental::Tensor> GetResults(
      const std::vector<paddle::experimental::Tensor>& inputs,
      bool allow_unused, bool create_graph) {
    VLOG(6) << "Running in GetResults";
    if (inputs.empty()) return {};

    std::vector<paddle::experimental::Tensor> results;
    results.reserve(inputs.size());

    for (size_t i = 0; i < inputs.size(); ++i) {
      auto& input = inputs[i];
      AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input);
      auto target_node = auto_grad_meta->GetMutableGradNode().get();

      auto iter = results_map.find(target_node);
      if (iter != results_map.end()) {
        // set StopGradient = !create_graph
        AutogradMeta* tensor_auto_grad_meta =
            EagerUtils::autograd_meta(&(iter->second));
        tensor_auto_grad_meta->SetStopGradient(!create_graph);
        results.emplace_back(iter->second);
      } else {
        PADDLE_ENFORCE_EQ(allow_unused, true,
                          paddle::platform::errors::InvalidArgument(
                              "The %d-th input does not appear in the backward "
                              "graph. Please check the input tensor or set "
                              "allow_unused=True to get None result.",
                              i));
        results.emplace_back();
      }
    }
    Clear();
    return results;
  }

  void PreparedForGeneralGrad(
      const std::vector<paddle::experimental::Tensor>& inputs,
      const std::vector<paddle::experimental::Tensor>& no_grad_vars,
      std::queue<GradNodeBase*>* queue,
      const std::unordered_map<GradNodeBase*,
                               std::unique_ptr<GradTensorHolder>>&
          node_input_buffers_dict) {
    // Get no_grad_vars' GradNodes and InputMeta info
    GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */);
    // Get inputs' GradNodes and InputMeta info
    GetTargetNodesInfo(inputs, false /* is_no_grad_vars */);
    // Purify potential_startup_nodes: remove those nodes that are the same as
    // input_target_nodes
    PurifyPotentialStartUpNodes();
    // Get graph info between the input target GradNodes and the outputs;
    // record depending_nodes, potential_stop_nodes and potential_startup_nodes
    GetGraphInfoBetweenTargets(*queue);
    // Reset the queue. The queue is empty only when
    // 1. input equals output, or 2. input cannot reach output.
    ModifyReadyQueue(queue);
    // Set result for input target grad_var when queue is empty
    if (queue->empty()) SetResultForInputTargetVar(node_input_buffers_dict);
  }

  bool IsPotentialStopNodes(GradNodeBase* node) {
    return potential_stop_nodes.count(node);
  }

  std::unordered_map<GradNodeBase*, AutogradMeta*>*
  GetNoGradVarNodesInputMetaMap() {
    return &no_grad_var_nodes_inputmeta_map;
  }

  std::unordered_map<GradNodeBase*, AutogradMeta*>*
  GetInPutTargetNodesInputMetaMap() {
    return &input_target_nodes_inputmeta_map;
  }

  std::unordered_set<GradNodeBase*>* GetPotentialStopNodes() {
    return &potential_stop_nodes;
  }

  std::unordered_set<GradNodeBase*>* GetPotentialStartupNodes() {
    return &potential_startup_nodes;
  }

  void Clear() {
    no_grad_var_nodes_inputmeta_map.clear();
    input_target_nodes_inputmeta_map.clear();
    potential_startup_nodes.clear();
    potential_stop_nodes.clear();
    depending_nodes.clear();
    results_map.clear();
  }

 private:
  GeneralGrad() = default;
  static GeneralGrad* general_grad_;
  // no_grad_vars' GradNode and GradNode's InputMeta.
  std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>
      no_grad_var_nodes_inputmeta_map;
  // inputs' GradNode and GradNode's InputMeta.
  std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>
      input_target_nodes_inputmeta_map;
  // Records all the potential startup nodes; will be changed.
  std::unordered_set<GradNodeBase*> potential_startup_nodes;
  // Records all the potential stop nodes; will be changed.
  std::unordered_set<GradNodeBase*> potential_stop_nodes;
  std::unordered_map<GradNodeBase* /* next node */,
                     std::unordered_set<GradNodeBase*> /* pre nodes */>
      depending_nodes;
  std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map;
  DISABLE_COPY_AND_ASSIGN(GeneralGrad);
};
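// Illustrative sketch (not part of this commit): the pruning performed by
// UpdateGraphInfo boils down to a reverse BFS from the input target nodes
// through depending_nodes. Every visited node lies on a path from an output
// to an input; everything else can be dropped from the potential node sets.
// `NodesReachingTargets` is a hypothetical helper, shown only to make that
// traversal explicit (assumes <queue>, <unordered_map>, <unordered_set>).
static std::unordered_set<GradNodeBase*> NodesReachingTargets(
    const std::unordered_set<GradNodeBase*>& targets,
    const std::unordered_map<GradNodeBase*, std::unordered_set<GradNodeBase*>>&
        depending_nodes) {
  std::unordered_set<GradNodeBase*> reachable;
  std::queue<GradNodeBase*> queue;
  for (auto* t : targets) queue.push(t);
  while (!queue.empty()) {
    auto* cur = queue.front();
    queue.pop();
    if (!reachable.insert(cur).second) continue;  // already visited
    auto it = depending_nodes.find(cur);
    if (it == depending_nodes.end()) continue;    // startup node: no producers
    for (auto* pre : it->second) queue.push(pre);
  }
  return reachable;
}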
std::unordered_map<GradNodeBase*, int> getInDegreeMap(
    const std::queue<GradNodeBase*>& init_queue) {
  // Calculate in_degree for each node
  std::unordered_map<GradNodeBase*, int> node_in_degree_map;

  // Copy nodes
@@ -171,101 +369,30 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
    }
    visited.insert(node);

    PADDLE_ENFORCE_NOT_NULL(
        node,
        paddle::platform::errors::Fatal(
            "We got null node when we traverse the backward graph, and this "
            "should not happen, please check your code and contact us."));

    // Find and append next nodes
    const std::vector<std::vector<Edge>>& edges = node->GetEdges();
    for (const auto& edge_list : edges) {
      for (const Edge& edge : edge_list) {
        GradNodeBase* next_node = edge.GetMutableGradNode().get();

        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached.
        // Or it could also originate from dispensable inputs.
        if (!next_node) continue;

        // Update in_degree
        if (!node_in_degree_map.count(next_node))
          node_in_degree_map[next_node] = 0;
        node_in_degree_map[next_node]++;
        queue.push(next_node);
      }
    }
  }
  return node_in_degree_map;
}
// Enforce GradNode has TensorWrappers as Input
@@ -281,28 +408,23 @@ void EnforceGradNodeHasInput(GradNodeBase* node) {
          node->name()));
}

void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
                    bool is_input) {
  std::unordered_set<AutogradMeta*> visisted_ins;
  std::string msg = is_input ? "inputs" : "outputs";
  for (auto in : inputs) {
    AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
    PADDLE_ENFORCE_EQ(
        visisted_ins.count(auto_grad_meta), 0,
        paddle::platform::errors::AlreadyExists(
            "%s contain duplicate tensor %s, please check %s carefully.", msg,
            in.name(), msg));
    visisted_ins.insert(auto_grad_meta);
  }
}

GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();

std::vector<paddle::experimental::Tensor> RunBackward(
    const std::vector<paddle::experimental::Tensor>& tensors,  // output
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
@@ -315,10 +437,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
  // *Inplace version check should perform at node-level
  // *Cross-batch accumulation happens at forward pass

  // GeneralGrad
  bool is_general_grad = !inputs.empty();

  /* --- Initialization --- */
  // 1. Init queue with starting nodes
@@ -326,7 +446,6 @@ std::vector<paddle::experimental::Tensor> RunBackward(
  std::queue<GradNodeBase*> queue;
  std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
      node_input_buffers_dict;

  for (size_t i = 0; i < tensors.size(); i++) {
    const paddle::experimental::Tensor& tensor = tensors[i];
@@ -363,7 +482,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
        paddle::platform::errors::Fatal(
            "Detected size mismatch between tensors and grad_tensors. "
            "grad_tensors should either have "
            "size = 0 or same size as tensors."));
    // Feed given tensor if it's provided
    VLOG(6) << "Fill grad input tensor " << i << " with given grad tensor";
@@ -391,7 +510,9 @@ std::vector<paddle::experimental::Tensor> RunBackward(
    // Prepare queue, potential startup_nodes
    queue.push(grad_node);
    if (is_general_grad) {
      GeneralGrad::Instance().GetPotentialStartupNodes()->emplace(grad_node);
    }
  }

  VLOG(6) << "Update In degree Map for backward";
@@ -399,56 +520,13 @@ std::vector<paddle::experimental::Tensor> RunBackward(
  std::unordered_map<GradNodeBase*, int> node_in_degree_map =
      getInDegreeMap(queue);

  if (is_general_grad) {
    // Prepare several vital preprocessing steps for GeneralGrad
    GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue,
                                                   node_input_buffers_dict);
  }

  VLOG(6) << " startup_ops' size is :" << queue.size();

  /* --- Topological Visit --- */
  // 1. Pop queue
@@ -458,53 +536,55 @@ std::vector<paddle::experimental::Tensor> RunBackward(
  // |- Prepare for next node
  // 3. Update queue
  VLOG(6) << "Run Backward";
  while (!queue.empty()) {
    GradNodeBase* node = queue.front();
    VLOG(6) << "Running GradNode:" << node->name();

    paddle::platform::RecordEvent node_record_event(
        std::string(typeid(*node).name()) + " grad_node",
        paddle::platform::TracerEventType::Operator, 1);

    if (queue.size() > 1 && node_in_degree_map[node] != 0) {
      queue.pop();
      continue;
    }
    queue.pop();

    // Run node: This is where Hook happens
    PADDLE_ENFORCE(
        node_input_buffers_dict.count(node),
        paddle::platform::errors::Fatal(
            "Unable to find next node in the GradTensorHolder \n"
            "Trying to run Node without configuring its GradTensorHolder."));

    std::unique_ptr<GradTensorHolder> node_input_buffer =
        std::move(node_input_buffers_dict[node]);

    // Set input target grad_var from node_input_buffer by inputmeta
    if (!inputs.empty() && is_general_grad) {
      GeneralGrad::Instance().SetResultForInputTargetVar(*node_input_buffer,
                                                         node);
    }

    // no_grad_vars
    if (!no_grad_vars.empty() && is_general_grad) {
      auto iter =
          GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->find(node);
      if (iter !=
          GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->end()) {
        VLOG(6) << "Change the input buffer[slot][rank] by Zeros";
        auto rank_info = (iter->second)->OutRankInfo();
        node_input_buffer->SetBufferSlotRankZeros(rank_info.first,
                                                  rank_info.second);
      }
    }

    VLOG(6) << "Running GradNode:" << node->name();

    // Check input
    EnforceGradNodeHasInput(node);

    VLOG(6) << "Run Backward Kernel with GradTensorHolder.";
    // Run Pre Backward Node and get outputs
    std::vector<std::vector<paddle::experimental::Tensor>> grad_output_tensors =
        (*node)(node_input_buffer->Buffers(), create_graph);
@@ -587,23 +667,29 @@ std::vector<paddle::experimental::Tensor> RunBackward(
              node_in_degree_map[next_node] >= 0,
              paddle::platform::errors::Fatal(
                  "Detected in-degree value smaller than zero. For Node: %s"
                  "Node's in-degree cannot be negative.",
                  next_node->name()));

          if (is_general_grad) {
            bool is_potential_stop_node =
                GeneralGrad::Instance().GetPotentialStopNodes()->count(
                    next_node);
            if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) {
              queue.emplace(std::move(next_node));
            }
          } else {
            if (node_in_degree_map[next_node] == 0) {
              queue.emplace(std::move(next_node));
            }
          }
        }
      }
    }
  }

  if (!is_general_grad) return {};
  return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph);
}

void Backward(
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph) {
  VLOG(6) << "Run in Backward";
@@ -613,12 +699,16 @@ void Backward(
}

std::vector<paddle::experimental::Tensor> Grad(
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
    const std::vector<paddle::experimental::Tensor>& inputs,
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused,
    const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
  VLOG(6) << "Run in Grad";

  DuplicateCheck(inputs, true /* is_input */);
  DuplicateCheck(tensors, false /* is_input */);

  return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs,
                     allow_unused, no_grad_vars);
}
......
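// Illustrative usage sketch (not part of this commit): how the partial-grad
// entry point above is typically called. `out` and `x` are hypothetical
// tensors recorded by the eager autograd engine.
std::vector<paddle::experimental::Tensor> dx = egr::Grad(
    /*tensors=*/{out},       // forward outputs to start the backward pass from
    /*inputs=*/{x},          // gradients are returned only for these tensors
    /*grad_tensors=*/{},     // empty: the engine fills ones-like gradients
    /*retain_graph=*/false,
    /*create_graph=*/false,  // results are returned with stop_gradient = true
    /*only_inputs=*/true,
    /*allow_unused=*/false,  // error out if `x` cannot be reached from `out`
    /*no_grad_vars=*/{});    // nodes listed here receive zero gradients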
@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <thrust/host_vector.h>
#include "heter_comm.h"
#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
@@ -40,11 +41,13 @@ class GpuPsGraphTable : public HeterComm<int64_t, int, int> {
                        int sample_size, int len);
  NodeQueryResult *query_node_list(int gpu_id, int start, int query_size);
  void clear_graph_info();
  void move_neighbor_sample_result_to_source_gpu(
      int gpu_id, int gpu_num, int *h_left, int *h_right,
      int64_t *src_sample_res, thrust::host_vector<int> &total_sample_size);
  void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num,
                                               int *h_left, int *h_right,
                                               int *actual_sample_size,
                                               int *total_sample_size);
  int init_cpu_table(const paddle::distributed::GraphParameter &graph);
  int load(const std::string &path, const std::string &param);
  virtual int32_t end_graph_sampling() {
......
@@ -13,10 +13,23 @@
// limitations under the License.
#pragma once
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/transform.h>
#ifdef PADDLE_WITH_HETERPS
//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
namespace paddle {
namespace framework {

constexpr int WARP_SIZE = 32;

/*
comment 0
this kernel just serves as an example of how to sample nodes' neighbors.
@@ -29,20 +42,79 @@ sample_size;
*/

struct MaxFunctor {
  int sample_size;
  HOSTDEVICE explicit inline MaxFunctor(int sample_size) {
    this->sample_size = sample_size;
  }
  HOSTDEVICE inline int operator()(int x) const {
    if (x > sample_size) {
      return sample_size;
    }
    return x;
  }
};
struct DegreeFunctor {
GpuPsCommGraph graph;
HOSTDEVICE explicit inline DegreeFunctor(GpuPsCommGraph graph) {
this->graph = graph;
}
HOSTDEVICE inline int operator()(int i) const {
return graph.node_list[i].neighbor_size;
}
};
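// Illustrative sketch (not part of this commit): DegreeFunctor and MaxFunctor
// are consumed by thrust::transform over a shard's node indices, and the
// clipped sizes are then summed with thrust::reduce -- the same pipeline used
// in graph_neighbor_sample below. `ClippedSampleTotal` is a hypothetical
// helper shown only to make that pipeline explicit.
static int ClippedSampleTotal(const thrust::device_vector<int>& node_index,
                              GpuPsCommGraph graph, int sample_size) {
  thrust::device_vector<int> actual(node_index.size());
  thrust::transform(node_index.begin(), node_index.end(), actual.begin(),
                    DegreeFunctor(graph));  // neighbor count per node
  thrust::transform(actual.begin(), actual.end(), actual.begin(),
                    MaxFunctor(sample_size));  // clip at sample_size
  return thrust::reduce(actual.begin(), actual.end());  // shard total
}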
template <int BLOCK_WARPS, int TILE_SIZE>
__global__ void neighbor_sample(const uint64_t rand_seed, GpuPsCommGraph graph,
int sample_size, int* index, int len,
int64_t* sample_result, int* output_idx,
int* output_offset) {
assert(blockDim.x == WARP_SIZE);
assert(blockDim.y == BLOCK_WARPS);
int i = blockIdx.x * TILE_SIZE + threadIdx.y;
const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, len);
curandState rng;
curand_init(rand_seed * gridDim.x + blockIdx.x,
threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng);
  while (i < last_idx) {
    auto node_index = index[i];
    int degree = graph.node_list[node_index].neighbor_size;
    const int offset = graph.node_list[node_index].neighbor_offset;
    int output_start = output_offset[i];

    if (degree <= sample_size) {
      // Just copy
      for (int j = threadIdx.x; j < degree; j += WARP_SIZE) {
        sample_result[output_start + j] = graph.neighbor_list[offset + j];
      }
    } else {
      for (int j = threadIdx.x; j < degree; j += WARP_SIZE) {
        output_idx[output_start + j] = j;
      }

      __syncwarp();

      for (int j = sample_size + threadIdx.x; j < degree; j += WARP_SIZE) {
        const int num = curand(&rng) % (j + 1);
        if (num < sample_size) {
          atomicMax(
              reinterpret_cast<unsigned int*>(output_idx + output_start + num),
              static_cast<unsigned int>(j));
        }
      }

      __syncwarp();

      for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) {
        const int perm_idx = output_idx[output_start + j] + offset;
        sample_result[output_start + j] = graph.neighbor_list[perm_idx];
      }
    }
    i += BLOCK_WARPS;
  }
}
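// Illustrative sketch (not part of this commit): the `else` branch above is a
// warp-cooperative reservoir sample over the node's neighbor positions; the
// kernel resolves concurrent replacements with atomicMax, while this
// single-threaded host version simply overwrites. Assumes <random>, <vector>.
static std::vector<int> ReservoirSampleIndices(int degree, int sample_size,
                                               std::mt19937* rng) {
  std::vector<int> idx(sample_size);
  for (int j = 0; j < sample_size; ++j) idx[j] = j;  // keep the first k
  for (int j = sample_size; j < degree; ++j) {
    // position j replaces a random slot with probability sample_size / (j + 1)
    int num = std::uniform_int_distribution<int>(0, j)(*rng);
    if (num < sample_size) idx[num] = j;
  }
  return idx;  // indices into graph.neighbor_list[offset + ...]
}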
@@ -79,7 +151,7 @@ int GpuPsGraphTable::load(const std::string& path, const std::string& param) {
  gpu i triggers a neighbor_sample task,
  when this task is done,
  this function is called to move the sample result on other gpu back
  to gpu i and aggregate the result.
  the sample_result is saved on src_sample_res and the actual sample size for
  each node is saved on actual_sample_size.
  the number of actual sample_result for
@@ -96,10 +168,50 @@ int GpuPsGraphTable::load(const std::string& path, const std::string& param) {
  that's what fill_dvals does.
*/
void GpuPsGraphTable::move_neighbor_sample_size_to_source_gpu(
int gpu_id, int gpu_num, int* h_left, int* h_right, int* actual_sample_size,
int* total_sample_size) {
  // This function copies actual_sample_size back to the source gpu,
  // and calculates the total sample number of each gpu.
for (int i = 0; i < gpu_num; i++) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
auto shard_len = h_right[i] - h_left[i] + 1;
auto& node = path_[gpu_id][i].nodes_.front();
cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
node.val_storage + sizeof(int) * shard_len,
sizeof(int) * shard_len, cudaMemcpyDefault,
node.out_stream);
}
for (int i = 0; i < gpu_num; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
total_sample_size[i] = 0;
continue;
}
auto& node = path_[gpu_id][i].nodes_.front();
cudaStreamSynchronize(node.out_stream);
auto shard_len = h_right[i] - h_left[i] + 1;
thrust::device_vector<int> t_actual_sample_size(shard_len);
thrust::copy(actual_sample_size + h_left[i],
actual_sample_size + h_left[i] + shard_len,
t_actual_sample_size.begin());
total_sample_size[i] = thrust::reduce(t_actual_sample_size.begin(),
t_actual_sample_size.end());
}
}
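// Illustrative sketch (not part of this commit): the cumulative offsets used
// below are a plain exclusive prefix sum; a host-side equivalent with C++17
// <numeric> and <vector>, shown only to mirror the thrust::exclusive_scan call.
static std::vector<int> ExclusivePrefixSum(const std::vector<int>& totals) {
  std::vector<int> starts(totals.size(), 0);
  std::exclusive_scan(totals.begin(), totals.end(), starts.begin(), 0);
  return starts;  // e.g. {4, 5, 1, 6} -> {0, 4, 9, 10}
}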
void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
    int gpu_id, int gpu_num, int* h_left, int* h_right,
    int64_t* src_sample_res, thrust::host_vector<int>& total_sample_size) {
  /*
    if total_sample_size is [4, 5, 1, 6],
    then cumsum_total_sample_size is [0, 4, 9, 10];
  */
  thrust::host_vector<int> cumsum_total_sample_size(gpu_num, 0);
  thrust::exclusive_scan(total_sample_size.begin(), total_sample_size.end(),
                         cumsum_total_sample_size.begin(), 0);
  for (int i = 0; i < gpu_num; i++) {
    if (h_left[i] == -1 || h_right[i] == -1) {
      continue;
@@ -109,14 +221,10 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
    // auto& node = path_[gpu_id][i].nodes_[cur_step];
    auto& node = path_[gpu_id][i].nodes_.front();
    cudaMemcpyAsync(
        reinterpret_cast<char*>(src_sample_res + cumsum_total_sample_size[i]),
        node.val_storage + sizeof(int64_t) * shard_len,
        sizeof(int64_t) * total_sample_size[i], cudaMemcpyDefault,
        node.out_stream);
  }
  for (int i = 0; i < gpu_num; ++i) {
    if (h_left[i] == -1 || h_right[i] == -1) {
@@ -131,17 +239,35 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
  TODO:
  how to optimize it to eliminate the for loop
*/
__global__ void fill_dvalues_actual_sample_size(int* d_shard_actual_sample_size,
                                                int* d_actual_sample_size,
                                                int* idx, int len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i];
  }
}
template <int BLOCK_WARPS, int TILE_SIZE>
__global__ void fill_dvalues_sample_result(int64_t* d_shard_vals,
int64_t* d_vals,
int* d_actual_sample_size, int* idx,
int* offset, int* d_offset,
int len) {
assert(blockDim.x == WARP_SIZE);
assert(blockDim.y == BLOCK_WARPS);
int i = blockIdx.x * TILE_SIZE + threadIdx.y;
const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, len);
while (i < last_idx) {
const int sample_size = d_actual_sample_size[idx[i]];
for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) {
d_vals[offset[idx[i]] + j] = d_shard_vals[d_offset[i] + j];
    }
#ifdef PADDLE_WITH_CUDA
    __syncwarp();
#endif
    i += BLOCK_WARPS;
  }
}
@@ -255,14 +381,12 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
  h_left = [0,5],h_right = [4,8]
  */
  NeighborSampleResult* result = new NeighborSampleResult(sample_size, len);
  if (len == 0) {
    return result;
  }

  int total_gpu = resource_->total_gpu();
  int dev_id = resource_->dev_id(gpu_id);
  platform::CUDAPlace place = platform::CUDAPlace(dev_id);
@@ -287,11 +411,6 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
  auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t));
  int64_t* d_shard_keys_ptr = reinterpret_cast<int64_t*>(d_shard_keys->ptr());

  split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id);
@@ -331,6 +450,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
    of alloc_mem_i, actual_sample_size_of_x equals ((int
    *)alloc_mem_i)[shard_len + x]
    */
    create_storage(gpu_id, i, shard_len * sizeof(int64_t),
                   shard_len * (1 + sample_size) * sizeof(int64_t));
  }
@@ -351,6 +471,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
                     h_right[i] - h_left[i] + 1,
                     resource_->remote_stream(i, gpu_id));
  }

  for (int i = 0; i < total_gpu; ++i) {
    if (h_left[i] == -1) {
      continue;
@@ -364,10 +485,42 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
    int* res_array = reinterpret_cast<int*>(node.val_storage);
    int* actual_size_array = res_array + shard_len;
    int64_t* sample_array = (int64_t*)(res_array + shard_len * 2);

    // 1. get actual_size_array.
    // 2. get sum of actual_size.
    // 3. get offset ptr
thrust::device_vector<int> t_res_array(shard_len);
thrust::copy(res_array, res_array + shard_len, t_res_array.begin());
thrust::device_vector<int> t_actual_size_array(shard_len);
thrust::transform(t_res_array.begin(), t_res_array.end(),
t_actual_size_array.begin(), DegreeFunctor(graph));
if (sample_size >= 0) {
thrust::transform(t_actual_size_array.begin(), t_actual_size_array.end(),
t_actual_size_array.begin(), MaxFunctor(sample_size));
}
thrust::copy(t_actual_size_array.begin(), t_actual_size_array.end(),
actual_size_array);
int total_sample_sum =
thrust::reduce(t_actual_size_array.begin(), t_actual_size_array.end());
thrust::device_vector<int> output_idx(total_sample_sum);
thrust::device_vector<int> output_offset(shard_len);
thrust::exclusive_scan(t_actual_size_array.begin(),
t_actual_size_array.end(), output_offset.begin(), 0);
constexpr int BLOCK_WARPS = 128 / WARP_SIZE;
constexpr int TILE_SIZE = BLOCK_WARPS * 16;
const dim3 block_(WARP_SIZE, BLOCK_WARPS);
const dim3 grid_((shard_len + TILE_SIZE - 1) / TILE_SIZE);
neighbor_sample<
BLOCK_WARPS,
TILE_SIZE><<<grid_, block_, 0, resource_->remote_stream(i, gpu_id)>>>(
0, graph, sample_size, res_array, shard_len, sample_array,
thrust::raw_pointer_cast(output_idx.data()),
thrust::raw_pointer_cast(output_offset.data()));
  }

  for (int i = 0; i < total_gpu; ++i) {
@@ -378,13 +531,56 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
    tables_[i]->rwlock_->UNLock();
  }
  // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr);

  auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
  int* d_shard_actual_sample_size_ptr =
      reinterpret_cast<int*>(d_shard_actual_sample_size->ptr());
// Store total sample number of each gpu.
thrust::host_vector<int> d_shard_total_sample_size(total_gpu, 0);
move_neighbor_sample_size_to_source_gpu(
gpu_id, total_gpu, h_left, h_right, d_shard_actual_sample_size_ptr,
thrust::raw_pointer_cast(d_shard_total_sample_size.data()));
int allocate_sample_num = 0;
for (int i = 0; i < total_gpu; ++i) {
allocate_sample_num += d_shard_total_sample_size[i];
}
auto d_shard_vals =
memory::Alloc(place, allocate_sample_num * sizeof(int64_t));
int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr());
move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, h_left, h_right,
d_shard_vals_ptr,
d_shard_total_sample_size);
cudaMalloc((void**)&result->val, allocate_sample_num * sizeof(int64_t));
cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int));
cudaMalloc((void**)&result->offset, len * sizeof(int));
int64_t* val = result->val;
int* actual_sample_size = result->actual_sample_size;
int* offset = result->offset;
fill_dvalues_actual_sample_size<<<grid_size, block_size_, 0, stream>>>(
d_shard_actual_sample_size_ptr, actual_sample_size, d_idx_ptr, len);
thrust::device_vector<int> t_actual_sample_size(len);
thrust::copy(actual_sample_size, actual_sample_size + len,
t_actual_sample_size.begin());
thrust::exclusive_scan(t_actual_sample_size.begin(),
t_actual_sample_size.end(), offset, 0);
int* d_offset;
cudaMalloc(&d_offset, len * sizeof(int));
thrust::copy(d_shard_actual_sample_size_ptr,
d_shard_actual_sample_size_ptr + len,
t_actual_sample_size.begin());
thrust::exclusive_scan(t_actual_sample_size.begin(),
t_actual_sample_size.end(), d_offset, 0);
constexpr int BLOCK_WARPS_ = 128 / WARP_SIZE;
constexpr int TILE_SIZE_ = BLOCK_WARPS_ * 16;
const dim3 block__(WARP_SIZE, BLOCK_WARPS_);
const dim3 grid__((len + TILE_SIZE_ - 1) / TILE_SIZE_);
fill_dvalues_sample_result<BLOCK_WARPS_,
TILE_SIZE_><<<grid__, block__, 0, stream>>>(
d_shard_vals_ptr, val, actual_sample_size, d_idx_ptr, offset, d_offset,
len);
  cudaStreamSynchronize(stream);
  for (int i = 0; i < total_gpu; ++i) {
    int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
@@ -393,6 +589,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
    }
    destroy_storage(gpu_id, i);
  }
  cudaFree(d_offset);
  return result;
}
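// Illustrative sketch (not part of this commit): with this change the sampler
// returns a flat `val` buffer plus per-key `actual_sample_size` and `offset`
// arrays instead of a fixed len * sample_size layout. `UnpackSampleResult` is
// a hypothetical host-side helper; field names follow NeighborSampleResult as
// used in the test below, and the copy sizes are illustrative.
static std::vector<std::vector<int64_t>> UnpackSampleResult(
    const NeighborSampleResult* res, int len, int total_sample_num) {
  std::vector<int64_t> val(total_sample_num);
  std::vector<int> actual_size(len), offset(len);
  cudaMemcpy(val.data(), res->val, total_sample_num * sizeof(int64_t),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(actual_size.data(), res->actual_sample_size, len * sizeof(int),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(offset.data(), res->offset, len * sizeof(int),
             cudaMemcpyDeviceToHost);
  std::vector<std::vector<int64_t>> neighbors(len);
  for (int i = 0; i < len; ++i) {
    neighbors[i].assign(val.begin() + offset[i],
                        val.begin() + offset[i] + actual_size[i]);
  }
  return neighbors;
}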
......
@@ -94,19 +94,44 @@ TEST(TEST_FLEET, graph_comm) {
  0 --index--->0
  7 --index-->2
  */
  int64_t cpu_key[3] = {7, 0, 6};
  void *key;
  cudaMalloc((void **)&key, 3 * sizeof(int64_t));
  cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
  auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3);
  res = new int64_t[7];
  cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost);
  int *actual_sample_size = new int[3];
  cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12,
             cudaMemcpyDeviceToHost);  // 3, 1, 3
  int *cumsum_sample_size = new int[3];
  cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12,
             cudaMemcpyDeviceToHost);  // 0, 3, 4
std::vector<std::vector<int64_t>> neighbors_;
std::vector<int64_t> neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35};
std::vector<int64_t> neighbors_0 = {0};
std::vector<int64_t> neighbors_6 = {21, 22, 23, 24, 25, 26, 27};
neighbors_.push_back(neighbors_7);
neighbors_.push_back(neighbors_0);
neighbors_.push_back(neighbors_6);
for (int i = 0; i < 3; i++) {
for (int j = cumsum_sample_size[i];
j < cumsum_sample_size[i] + actual_sample_size[i]; j++) {
bool flag = false;
for (int k = 0; k < neighbors_[i].size(); k++) {
if (res[j] == neighbors_[i][k]) {
flag = true;
break;
}
}
ASSERT_EQ(flag, true);
} }
    }
  }
  delete[] res;
  delete[] actual_sample_size;
  delete[] cumsum_sample_size;
  delete neighbor_sample_res;
@@ -25,14 +25,14 @@ std::set<std::string> ignored_ops = {
    "sum",
    "clip",
    "clip_by_norm",
    "reduce_sum",
    "sqrt",
    "elementwise_max",
    "elementwise_div",
    "elementwise_mul",
    "scale",           // adamax
    "assign",          // adamw
    "squared_l2_norm"  // gradient_clip_norm
};

const bool startswith(const std::string& str, const std::string& pre) {
@@ -62,6 +62,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
  new_op.SetAttr("with_lr_sched", false);

  std::set<std::string> set_ops{};
// save the weight decay tensor_name and weight_decay_value for Lamb
std::vector<std::string> weight_decay_vars{};
std::vector<float> weight_decay_values{};
  // use map store <op_type, op_ptr> ?
  for (auto* node : graph->Nodes()) {
    if (!node->IsOp()) {
@@ -75,6 +79,15 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
    auto op_role = static_cast<OpRole>(op_role_);

    if (op_role == OpRole::kOptimize) {
// save weight decay value from every lamb optimizer op
if (op_type == "lamb" && op->HasAttr("weight_decay")) {
auto weight_decay_value =
BOOST_GET_CONST(float, op->GetAttr("weight_decay"));
auto params = op->Output("ParamOut");
weight_decay_vars.push_back(params[0]);
weight_decay_values.push_back(weight_decay_value);
}
      if (set_ops.count(op_type)) {
        continue;
      }
@@ -270,7 +283,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
    // seems with_lr_sched is always true
    new_op.SetAttr("with_lr_sched", true);

    // setup weight decay for Lamb
    new_op.SetAttr("weight_decay_vars", weight_decay_vars);
    new_op.SetAttr("weight_decay_values", weight_decay_values);

    // weight_decay/coeff is "scale" attr of scale_op
    if (set_ops.count("scale") && set_ops.count("sum")) {
      if (set_ops.count("sign")) {
......
...@@ -30,7 +30,8 @@ void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const {
  auto ipu_backend = platform::ipu::IpuBackend::GetInstance();
  auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16;
  auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op;
  if (enable_fp16 && transfer_cast_op) {
    for (auto* node : graph->Nodes()) {
      if (node->IsOp() && node->Op()->Type() == "popart_cast") {
        if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) ==
...
...@@ -28,7 +28,7 @@
USE_OP_ITSELF(batch_norm);
USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
USE_OP_ITSELF(conv2d_transpose);
USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
...
...@@ -79,18 +79,6 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
...@@ -390,6 +378,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
        "Copying from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_IPU
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copying from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}
template <typename TENSOR>
...@@ -447,27 +458,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }  // NOLINT
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }  // NOLINT
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_custom_place(
               dst_place)) { /* custom_device -> custom_device*/
...@@ -483,11 +482,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }  // NOLINT
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }  // NOLINT
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
...@@ -502,7 +501,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
    auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place);
    xpu_ctx->Wait();
  }
  }  // NOLINT
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
...@@ -601,6 +600,29 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_IPU
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}
template <typename Predicate, typename DevCtx>
...
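The IPU branches added above reuse the dispatch pattern this file already applies to XPU and custom devices: match on the (src_place, dst_place) pair, skip the copy when the source and destination buffers are identical, and fall through to an Unimplemented error otherwise. Below is a stripped-down sketch of that control flow, using a hypothetical Place enum and plain memcpy in place of memory::Copy; it is an illustration of the pattern, not the Paddle API:

#include <cstddef>
#include <cstring>
#include <stdexcept>

enum class Place { kCPU, kIPU };  // hypothetical stand-in for platform::Place

void CopySync(Place src_place, const void *src, Place dst_place, void *dst,
              size_t size) {
  if (src == dst) {
    return;  // same buffer: nothing to do, like the src_ptr == dst_ptr skip
  }
  // For IPU, both sides are backed by host memory, so one plain copy covers
  // cpu->ipu, ipu->cpu and ipu->ipu.
  if ((src_place == Place::kCPU || src_place == Place::kIPU) &&
      (dst_place == Place::kCPU || dst_place == Place::kIPU)) {
    std::memcpy(dst, src, size);
    return;
  }
  throw std::runtime_error("Copy between these places is not supported.");
}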
...@@ -1109,8 +1109,9 @@ void Reducer::FinalizeBackward() {
  if (find_unused_vars_each_step_) {
// TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||      \
    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
    defined(PADDLE_WITH_CNCL)
    ProcessUnusedDenseVars();
#endif
    // Initialize local used vars
...
...@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_OP_ITSELF(conv2d);
USE_OP_ITSELF(conv2d_transpose);
namespace paddle {
namespace inference {
...
...@@ -40,6 +40,13 @@ class FeedVariableVisitor : public boost::static_visitor<void> {
        out_var_->GetMutable<framework::LoDTensor>();
    if (platform::is_same_place(in_tensor.place(), place_)) {
      out_tensor->ShareDataWith(in_tensor);
#ifdef PADDLE_WITH_IPU
    } else if (platform::is_ipu_place(place_)) {
      // For ipu, both in_tensor and out_tensor are allocated on cpu,
      // PopART will copy tensor from host automatically,
      // no TensorCopy() is required here.
      out_tensor->ShareDataWith(in_tensor);
#endif
    } else {
      platform::DeviceContext *context =
          platform::DeviceContextPool::Instance().Get(place_);
...
...@@ -19,14 +19,16 @@ namespace operators {
template <typename DeviceContext, typename T>
class GemmConvXPUKernel : public framework::OpKernel<T> {
  using XPUT = typename XPUTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext &context) const override {
    const Tensor *input = context.Input<Tensor>("Input");
    // The filter will be reshaped in the calculations,
    // so here use an assignment operation,
    // that avoids modifying the variable in the Scope.
    Tensor filter = *context.Input<Tensor>("Filter");
    Tensor *output = context.Output<Tensor>("Output");
    output->mutable_data<T>(context.GetPlace());
    int groups = context.Attr<int>("groups");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
...@@ -53,11 +55,16 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
    const int img_h = static_cast<int>(input->dims()[2]);
    const int img_w = static_cast<int>(input->dims()[3]);
    const int f = static_cast<int>(filter.dims()[0]);
    const XPUT *input_data = reinterpret_cast<const XPUT *>(input->data<T>());
    const XPUT *filter_data = reinterpret_cast<const XPUT *>(filter.data<T>());
    XPUT *output_data = reinterpret_cast<XPUT *>(output->data<T>());
    auto &dev_ctx = context.template device_context<DeviceContext>();
    int r = xpu::conv2d<XPUT, XPUT, XPUT, int16_t>(
        dev_ctx.x_context(), input_data, filter_data, output_data, batch_size,
        img_c, img_h, img_w, f, ksize, strides, paddings, dilations, groups,
        nullptr, nullptr, nullptr, true);
    PADDLE_ENFORCE_EQ(
        r, XPU_SUCCESS,
        platform::errors::External("XPU conv kernel return wrong value[%d %s]",
...@@ -67,14 +74,16 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T>
class GemmConvGradXPUKernel : public framework::OpKernel<T> {
  using XPUT = typename XPUTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext &context) const override {
    const Tensor *input = context.Input<Tensor>("Input");
    const Tensor *output_grad =
        context.Input<Tensor>(framework::GradVarName("Output"));
    Tensor *input_grad =
        context.Output<Tensor>(framework::GradVarName("Input"));
    Tensor *filter_grad =
        context.Output<Tensor>(framework::GradVarName("Filter"));
    // The filter and filter_grad will be reshaped in the calculations,
    // so here use an assignment operation,
...@@ -107,19 +116,27 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
    const int img_h = static_cast<int>(input->dims()[2]);
    const int img_w = static_cast<int>(input->dims()[3]);
    const int f = static_cast<int>(filter.dims()[0]);
    const XPUT *input_data = reinterpret_cast<const XPUT *>(input->data<T>());
    const XPUT *filter_data = reinterpret_cast<const XPUT *>(filter.data<T>());
    const XPUT *output_grad_data =
        reinterpret_cast<const XPUT *>(output_grad->data<T>());
    XPUT *input_grad_data = nullptr;
    if (input_grad) {
      input_grad->mutable_data<T>(context.GetPlace());
      input_grad_data = reinterpret_cast<XPUT *>(input_grad->data<T>());
    }
    XPUT *filter_grad_data = nullptr;
    if (filter_grad) {
      filter_grad->mutable_data<T>(context.GetPlace());
      filter_grad_data = reinterpret_cast<XPUT *>(filter_grad->data<T>());
    }
    auto &dev_ctx = context.template device_context<DeviceContext>();
    int r = xpu::conv2d_grad<XPUT, XPUT, XPUT, int16_t>(
        dev_ctx.x_context(), input_data, filter_data, output_grad_data,
        input_grad_data, filter_grad_data, batch_size, img_c, img_h, img_w, f,
        ksize, strides, paddings, dilations, groups, nullptr, nullptr, nullptr,
        nullptr, true);
    PADDLE_ENFORCE_EQ(
        r, XPU_SUCCESS,
        platform::errors::External("XPU conv kernel return wrong value[%d %s]",
...@@ -130,14 +147,22 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
}  // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
    conv2d, ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>,
    ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext,
                           paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
    conv2d_grad,
    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext,
                               paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
    depthwise_conv2d,
    ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>,
    ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext,
                           paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
    depthwise_conv2d_grad,
    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext,
                               paddle::platform::float16>);
#endif
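The registrations above make the XPU conv kernels dtype-generic: the framework element type T (float or paddle::platform::float16) is mapped through XPUTypeTrait<T>::Type to the storage type the device library expects before xpu::conv2d is called. The following is a hedged sketch of that trait pattern with hypothetical type names; it is not the real XPUTypeTrait definition:

#include <cstdint>

struct fw_float16 { uint16_t x; };   // framework-side half type (hypothetical)
struct dev_float16 { uint16_t x; };  // device-library half type (hypothetical)

// Maps the framework element type to the device API's element type, in the
// spirit of how XPUTypeTrait<T>::Type is used by the kernels above.
template <typename T> struct DeviceTypeTrait { using Type = T; };
template <> struct DeviceTypeTrait<fw_float16> { using Type = dev_float16; };

template <typename T>
void RunConv(const T *in, T *out, int n) {
  using DevT = typename DeviceTypeTrait<T>::Type;
  const DevT *dev_in = reinterpret_cast<const DevT *>(in);
  DevT *dev_out = reinterpret_cast<DevT *>(out);
  // A real kernel would hand dev_in/dev_out to the device conv routine here.
  (void)dev_in;
  (void)dev_out;
  (void)n;
}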
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int D>
static void DataTranspose(const framework::ExecutionContext& ctx,
const Tensor* input, Tensor* output,
const std::vector<int>& axis, int flag = 0) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
phi::funcs::Transpose<platform::CUDADeviceContext, T, D> transpose;
auto in_dims = input->dims();
std::vector<int64_t> input_transpose_vec;
for (size_t i = 0; i < axis.size(); ++i) {
if (flag == 0)
input_transpose_vec.push_back(in_dims[axis[i]]);
else
input_transpose_vec.push_back(in_dims[i]);
}
framework::DDim input_transpose_dims(phi::make_ddim(input_transpose_vec));
output->mutable_data<T>(input_transpose_dims, ctx.GetPlace());
transpose(dev_ctx, *input, output, axis);
}
template <typename T>
class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
// cudnn v5 does not support dilations
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const T* filter_data = filter->data<T>();
const std::string data_layout_str = ctx.Attr<std::string>("data_format");
const paddle::platform::DataLayout data_layout =
(data_layout_str != "NHWC" ? platform::DataLayout::kNCHW
: platform::DataLayout::kNHWC);
// if channel_last, transpose to channel_first
Tensor input_transpose;
std::vector<int> input_vec = phi::vectorize<int>(input->dims());
std::vector<int> output_vec = phi::vectorize<int>(output->dims());
if (data_layout == platform::DataLayout::kNHWC) {
if (strides.size() == 2U) {
std::vector<int> axis = {0, 3, 1, 2};
for (size_t i = 0; i < axis.size(); ++i) {
input_vec[i] = input->dims()[axis[i]];
output_vec[i] = output->dims()[axis[i]];
}
DataTranspose<T, 4>(ctx, input, &input_transpose, axis);
} else if (strides.size() == 3U) {
std::vector<int> axis = {0, 4, 1, 2, 3};
for (size_t i = 0; i < axis.size(); ++i) {
input_vec[i] = input->dims()[axis[i]];
output_vec[i] = output->dims()[axis[i]];
}
DataTranspose<T, 5>(ctx, input, &input_transpose, axis);
}
} else {
input_transpose = *input;
}
// update padding and dilation
auto in_dims = input_transpose.dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = input_transpose.dims()[0];
new_input_shape_vec[1] = input_transpose.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_input_shape_vec[i + 2] =
input_transpose.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
}
framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
transformed_input =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
const int rank = input_transpose.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Op(ConvTranspose) only supports 4-D or 5-D input Tensor."));
}
} else {
transformed_input = input_transpose;
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
std::vector<int64_t> starts(data_dim, 0);
std::vector<int64_t> ends(data_dim, 0);
std::vector<int64_t> axes(data_dim, 0);
for (size_t i = 0; i < data_dim; ++i) {
starts[i] = input_pad[2 * i + 4] * (strides[i] + 1);
ends[i] = starts[i] + output_vec[i + 2];
axes[i] = i + 2;
}
const T* input_data = transformed_input.data<T>();
input_vec = phi::vectorize<int>(transformed_input.dims());
std::vector<int> transformed_output_vec = output_vec;
for (size_t i = 0; i < data_dim; ++i) {
transformed_output_vec[i + 2] =
output_vec[i + 2] +
(input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] -
2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1];
}
Tensor transformed_output;
if (!is_sys_pad) {
DDim transformed_output_shape(phi::make_ddim(transformed_output_vec));
transformed_output.mutable_data<T>(transformed_output_shape,
ctx.GetPlace());
} else {
output->mutable_data<T>(ctx.GetPlace());
transformed_output.ShareDataWith(*output);
transformed_output.Resize(phi::make_ddim(transformed_output_vec));
}
T* transformed_output_data = transformed_output.data<T>();
platform::DataLayout layout;
int iwo_groups = groups;
int c_groups = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
if (strides.size() == 2U) {
layout = platform::DataLayout::kNCHW;
} else {
layout = platform::DataLayout::kNCDHW;
}
size_t workspace_size = 0;
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t algo{};
#else
cudnnConvolutionBwdDataAlgo_t algo{};
#endif
// ------------------- cudnn conv algorithm ---------------------
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto layout_tensor = GetCudnnTensorFormat(layout);
bool deterministic = FLAGS_cudnn_deterministic;
auto dtype = platform::CudnnDataType<T>::type;
// ------------------- cudnn descriptors ---------------------
ConvArgs args{&transformed_output,
filter,
&transformed_input,
strides,
padding_common,
dilations,
dtype};
args.handle = handle;
args.idesc.set(transformed_output, iwo_groups);
args.wdesc.set(*filter, layout_tensor, iwo_groups);
args.odesc.set(transformed_input, iwo_groups);
args.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_groups);
#ifdef PADDLE_WITH_HIP
using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
algo = search::Find<T>(
args, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
algo = search::Find<T>(
args, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, algo));
#endif
// ------------------- cudnn conv transpose forward ---------------------
int input_offset =
transformed_input.numel() / transformed_input.dims()[0] / groups;
int output_offset =
transformed_output.numel() / transformed_output.dims()[0] / groups;
int filter_offset = filter->numel() / groups;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args.odesc.desc(),
input_data + input_offset * g, args.wdesc.desc(),
filter_data + filter_offset * g, args.cdesc.desc(), algo, &beta,
args.idesc.desc(), transformed_output_data + output_offset * g,
cudnn_workspace, workspace_size));
};
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, args.wdesc.desc(),
filter_data + filter_offset * g, args.odesc.desc(),
input_data + input_offset * g, args.cdesc.desc(), algo,
cudnn_workspace, workspace_size, &beta, args.idesc.desc(),
transformed_output_data + output_offset * g));
};
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
if (!is_sys_pad && strides.size() == 2U) {
Slice<paddle::platform::CUDADeviceContext, T, 4>(
ctx, &transformed_output, output, starts, ends, axes);
} else if (!is_sys_pad && strides.size() == 3U) {
Slice<paddle::platform::CUDADeviceContext, T, 5>(
ctx, &transformed_output, output, starts, ends, axes);
}
if (data_layout == platform::DataLayout::kNHWC) {
Tensor output_transpose;
Tensor output_nchw;
output_nchw.ShareDataWith(*output);
output_nchw.Resize(phi::make_ddim(output_vec));
if (strides.size() == 2U) {
std::vector<int> axis = {0, 2, 3, 1};
DataTranspose<T, 4>(ctx, &output_nchw, &output_transpose, axis);
*output = output_transpose;
} else if (strides.size() == 3U) {
std::vector<int> axis = {0, 2, 3, 4, 1};
DataTranspose<T, 5>(ctx, &output_nchw, &output_transpose, axis);
*output = output_transpose;
}
}
}
};
template <typename T>
class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
const T* filter_data = filter->data<T>();
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
// cudnn v5 does not support dilations
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
const std::string data_layout_str = ctx.Attr<std::string>("data_format");
const paddle::platform::DataLayout data_layout =
(data_layout_str != "NHWC" ? platform::DataLayout::kNCHW
: platform::DataLayout::kNHWC);
// if channel_last, transpose to channel_first
Tensor input_transpose;
Tensor output_grad_transpose;
std::vector<int> input_vec = phi::vectorize<int>(input->dims());
std::vector<int> output_vec = phi::vectorize<int>(output_grad->dims());
if (data_layout == platform::DataLayout::kNHWC) {
if (strides.size() == 2U) {
std::vector<int> axis = {0, 3, 1, 2};
for (size_t i = 0; i < axis.size(); ++i) {
input_vec[i] = input->dims()[axis[i]];
output_vec[i] = output_grad->dims()[axis[i]];
}
DataTranspose<T, 4>(ctx, input, &input_transpose, axis);
DataTranspose<T, 4>(ctx, output_grad, &output_grad_transpose, axis);
} else if (strides.size() == 3U) {
std::vector<int> axis = {0, 4, 1, 2, 3};
for (size_t i = 0; i < axis.size(); ++i) {
input_vec[i] = input->dims()[axis[i]];
output_vec[i] = output_grad->dims()[axis[i]];
}
DataTranspose<T, 5>(ctx, input, &input_transpose, axis);
DataTranspose<T, 5>(ctx, output_grad, &output_grad_transpose, axis);
}
} else {
input_transpose = *input;
output_grad_transpose = *output_grad;
}
// update padding and dilation
auto in_dims = input_transpose.dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_output_grad;
std::vector<int> padding_common(data_dim, 0);
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_output_grad_shape_vec(data_dim + 2);
new_output_grad_shape_vec[0] = output_grad_transpose.dims()[0];
new_output_grad_shape_vec[1] = output_grad_transpose.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_output_grad_shape_vec[i + 2] =
output_grad_transpose.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
}
framework::DDim new_output_grad_shape(
phi::make_ddim(new_output_grad_shape_vec));
transformed_output_grad.Resize(new_output_grad_shape);
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
transformed_output_grad =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_output_grad_shape, dev_ctx);
const int rank = input_transpose.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad);
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad);
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Op(ConvTranspose) only supports 4-D or 5-D input Tensor."));
}
} else {
transformed_output_grad = output_grad_transpose;
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = input_transpose.data<T>();
const T* output_grad_data = transformed_output_grad.data<T>();
output_vec = phi::vectorize<int>(transformed_output_grad.dims());
// ------------------- cudnn descriptors ---------------------
platform::DataLayout layout;
if (strides.size() == 2U) {
layout = platform::DataLayout::kNCHW;
} else {
layout = platform::DataLayout::kNCDHW;
}
int iwo_groups = groups;
int c_groups = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
auto dtype = platform::CudnnDataType<T>::type;
ConvArgs args1{&transformed_output_grad,
filter,
&input_transpose,
strides,
padding_common,
dilations,
dtype};
ConvArgs args2{&transformed_output_grad,
filter,
&input_transpose,
strides,
padding_common,
dilations,
dtype};
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t data_algo{};
miopenConvBwdWeightsAlgorithm_t filter_algo{};
#else
cudnnConvolutionFwdAlgo_t data_algo{};
cudnnConvolutionBwdFilterAlgo_t filter_algo{};
#endif
auto layout_tensor = GetCudnnTensorFormat(layout);
size_t workspace_size = 0;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
bool deterministic = FLAGS_cudnn_deterministic;
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
args1.handle = handle;
args1.idesc.set(transformed_output_grad, iwo_groups);
args1.wdesc.set(*filter, layout_tensor, iwo_groups);
args1.odesc.set(input_transpose, iwo_groups);
args1.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_groups);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1));
data_algo = search1::Find<T>(
args1, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search1::Find<T>(
args1, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
#endif
}
if (filter_grad) {
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
args2.handle = handle;
args2.idesc.set(transformed_output_grad, iwo_groups);
args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
args2.odesc.set(input_transpose, iwo_groups);
args2.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_groups);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_algo = search2::Find<T>(
args2, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search2::Find<T>(
args2, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, filter_algo));
#endif
}
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
int input_offset = input->numel() / input->dims()[0] / groups;
int output_grad_offset = transformed_output_grad.numel() /
transformed_output_grad.dims()[0] / groups;
int filter_offset = filter->numel() / groups;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
if (input_grad) {
// Because beta is zero, it is unnecessary to reset input_grad.
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
handle, &alpha, args1.idesc.desc(),
output_grad_data + output_grad_offset * g, args1.wdesc.desc(),
filter_data + filter_offset * g, args1.cdesc.desc(),
data_algo, &beta, args1.odesc.desc(),
input_grad_data + input_offset * g, cudnn_workspace,
workspace_size));
};
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward(
handle, &alpha, args1.idesc.desc(),
output_grad_data + output_grad_offset * g, args1.wdesc.desc(),
filter_data + filter_offset * g, args1.cdesc.desc(), data_algo,
cudnn_workspace, workspace_size, &beta, args1.odesc.desc(),
input_grad_data + input_offset * g));
};
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
if (data_layout == platform::DataLayout::kNHWC) {
Tensor input_grad_transpose;
Tensor input_grad_nchw;
input_grad_nchw.ShareDataWith(*input_grad);
input_grad_nchw.Resize(phi::make_ddim(input_vec));
if (strides.size() == 2U) {
std::vector<int> axis = {0, 2, 3, 1};
DataTranspose<T, 4>(ctx, &input_grad_nchw, &input_grad_transpose,
axis);
*input_grad = input_grad_transpose;
} else if (strides.size() == 3U) {
std::vector<int> axis = {0, 2, 3, 4, 1};
DataTranspose<T, 5>(ctx, &input_grad_nchw, &input_grad_transpose,
axis);
*input_grad = input_grad_transpose;
}
}
}
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeights(
handle, &alpha, args2.odesc.desc(),
input_data + input_offset * g, args2.idesc.desc(),
output_grad_data + output_grad_offset * g, args2.cdesc.desc(),
filter_algo, &beta, args2.wdesc.desc(),
filter_grad_data + filter_offset * g, cudnn_workspace,
workspace_size));
};
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, args2.idesc.desc(),
output_grad_data + output_grad_offset * g, args2.odesc.desc(),
input_data + input_offset * g, args2.cdesc.desc(),
filter_algo, cudnn_workspace, workspace_size, &beta,
args2.wdesc.desc(), filter_grad_data + filter_offset * g));
};
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
}
}
};
/*
* Inputs: I, W, dO, ddI, ddW
* Outputs: ddO, dW, dI
* ddo = conv_bp_data(W, ddI) + conv_bp_data(ddW, I)
* dW = conv_bp_filter(dO, ddI)
* dI = conv(dO, ddW)
*/
template <typename T>
class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
auto X = ctx.Input<Tensor>("Input");
auto W = ctx.Input<Tensor>("Filter");
auto dO = ctx.Input<Tensor>("DOutput");
auto ddX = ctx.Input<Tensor>("DDInput");
auto ddW = ctx.Input<Tensor>("DDFilter");
auto ddO = ctx.Output<Tensor>("DDOutput");
auto dW = ctx.Output<Tensor>("DFilter");
auto dX = ctx.Output<Tensor>("DInput");
if (ddO) {
ddO->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
set_zero(dev_ctx, ddO, static_cast<T>(0));
}
if (dW) {
dW->mutable_data<T>(ctx.GetPlace());
}
if (dX) {
dX->mutable_data<T>(ctx.GetPlace());
}
const T* dy = dO->data<T>();
const T* w = W->data<T>();
const T* ddx = nullptr;
const T* ddw = nullptr;
T *dw, *dx, *ddy;
dw = dx = ddy = nullptr;
T* transformed_dx = nullptr;
const std::vector<int>& strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool deterministic = FLAGS_cudnn_deterministic;
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensors to channel first-----------
Tensor transformed_X_channel(X->type());
Tensor transformed_dO_channel(dO->type());
Tensor transformed_ddX_channel(X->type());
Tensor transformed_ddO_channel(dO->type());
Tensor transformed_dX_channel(X->type());
if (channel_last) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, X, &transformed_X_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, X, &transformed_X_channel);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dO, &transformed_dO_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dO, &transformed_dO_channel);
if (ddX) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddX, &transformed_ddX_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddX, &transformed_ddX_channel);
}
if (ddO) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddO, &transformed_ddO_channel);
}
if (dX) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dX, &transformed_dX_channel);
transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
}
} else {
transformed_X_channel = *X;
transformed_dO_channel = *dO;
if (ddX) {
transformed_ddX_channel = *ddX;
}
if (dX) {
transformed_dX_channel = *dX;
}
}
std::vector<int> output_vec =
phi::vectorize<int>(transformed_dO_channel.dims());
auto in_dims = transformed_X_channel.dims();
auto filter_dims = W->dims();
framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type());
Tensor transformed_dO(dO->type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(X->dims().size() * 2, 0);
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
std::vector<int> new_output_grad_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_X_channel.dims()[0];
new_input_shape_vec[1] = transformed_X_channel.dims()[1];
new_output_grad_shape_vec[0] = transformed_dO_channel.dims()[0];
new_output_grad_shape_vec[1] = transformed_dO_channel.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_input_shape_vec[i + 2] =
transformed_X_channel.dims()[i + 2] + padding_diff[i];
new_output_grad_shape_vec[i + 2] =
transformed_dO_channel.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
}
framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
transformed_X.Resize(new_input_shape);
transformed_ddX.Resize(new_input_shape);
framework::DDim new_output_grad_shape(
phi::make_ddim(new_output_grad_shape_vec));
transformed_dO.Resize(new_output_grad_shape);
transformed_dO =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_output_grad_shape, dev_ctx);
transformed_X =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
if (ddX) {
transformed_ddX =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
}
// pad for input
const int rank = X->dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (dO) {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_dO_channel, pad_value,
&transformed_dO);
}
if (ddX) {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_X = transformed_X_channel;
transformed_dO = transformed_dO_channel;
if (ddX) {
transformed_ddX = transformed_ddX_channel;
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
std::vector<int64_t> starts(data_dim, 0);
std::vector<int64_t> ends(data_dim, 0);
std::vector<int64_t> axes(data_dim, 0);
for (size_t i = 0; i < data_dim; ++i) {
starts[i] = input_pad[2 * i + 4] * (strides[i] + 1);
ends[i] = starts[i] + output_vec[i + 2];
axes[i] = i + 2;
}
std::vector<int> transformed_output_vec = output_vec;
for (size_t i = 0; i < data_dim; ++i) {
transformed_output_vec[i + 2] =
output_vec[i + 2] +
(input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] -
2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1];
}
if (!is_sys_pad) {
DDim transformed_output_shape(phi::make_ddim(transformed_output_vec));
transformed_ddO_channel.mutable_data<T>(transformed_output_shape,
ctx.GetPlace());
} else {
ddO->mutable_data<T>(ctx.GetPlace());
transformed_ddO_channel = *ddO;
transformed_ddO_channel.Resize(phi::make_ddim(transformed_output_vec));
}
const T* x = transformed_X.data<T>();
int iwo_group = groups;
int c_group = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_group = 1;
c_group = groups;
groups = 1;
#endif
auto dtype = platform::CudnnDataType<T>::type;
auto handle = dev_ctx.cudnn_handle();
ConvArgs args1{&transformed_ddO_channel,
W,
&transformed_ddX,
strides,
padding_common,
dilations,
dtype};
ConvArgs args2{&transformed_ddO_channel, ddW, &transformed_X, strides,
padding_common, dilations, dtype};
ConvArgs args3{&transformed_dO,
dW,
&transformed_ddX_channel,
strides,
padding_common,
dilations,
dtype};
ConvArgs args4{
&transformed_dO, ddW, &transformed_dX_channel, strides, padding_common,
dilations, dtype};
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t bwd_algo1 =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdDataAlgorithm_t bwd_algo2 =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvFwdAlgorithm_t data_algo =
static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionBwdDataAlgo_t bwd_algo1 =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdDataAlgo_t bwd_algo2 =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionFwdAlgo_t data_algo =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
auto layout = GetCudnnTensorFormat(platform::DataLayout::kNCHW);
// ddo = conv(ddI, W) + conv(I, ddW)
size_t workspace_size = 0;
T* transformed_ddy_channel = nullptr;
if (ddO) {
ddy = ddO->data<T>();
transformed_ddy_channel = transformed_ddO_channel.data<T>();
if (ddX) {
args1.handle = handle;
args1.idesc.set(transformed_ddO_channel, iwo_group);
args1.wdesc.set(*W, layout, iwo_group);
args1.odesc.set(transformed_ddX, iwo_group);
args1.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
bwd_algo1 = search1::Find<T>(
args1, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo1 = search1::Find<T>(
args1, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
#endif
}
if (ddW) {
ddw = ddW->data<T>();
args2.handle = handle;
args2.idesc.set(transformed_ddO_channel, iwo_group);
args2.wdesc.set(*ddW, layout, iwo_group);
args2.odesc.set(transformed_X, iwo_group);
args2.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
bwd_algo2 = search2::Find<T>(
args2, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo2 = search2::Find<T>(
args2, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, bwd_algo2));
#endif
}
}
if (dW && ddX) {
dw = dW->data<T>();
args3.handle = handle;
args3.idesc.set(transformed_dO, iwo_group);
args3.wdesc.set(*dW, layout, iwo_group);
args3.odesc.set(transformed_ddX_channel, iwo_group);
args3.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = search3::Find<T>(
args3, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search3::Find<T>(
args3, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size,
search3::GetWorkspaceSize(args3, filter_algo));
#endif
}
if (ddW && dX) {
transformed_dx = transformed_dX_channel.data<T>();
args4.handle = handle;
args4.idesc.set(transformed_dO, iwo_group);
args4.wdesc.set(*ddW, layout, iwo_group);
args4.odesc.set(transformed_dX_channel, iwo_group);
args4.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = search4::Find<T>(
args4, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else
using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search4::Find<T>(
args4, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif
}
int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(transformed_X.dims(), platform::DataLayout::kNCHW, &i_n, &i_c,
&i_d, &i_h, &i_w);
int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_dO.dims(), platform::DataLayout::kNCHW, &o_n, &o_c,
&o_d, &o_h, &o_w);
int group_offset_in =
transformed_X.numel() / transformed_X.dims()[0] / groups;
int group_offset_out =
transformed_dO.numel() / transformed_dO.dims()[0] / groups;
int group_offset_filter = W->numel() / groups;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
if (ddO) {
if (ddX) {
ddx = transformed_ddX.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args1.odesc.desc(),
ddx + i * group_offset_in, args1.wdesc.desc(),
w + i * group_offset_filter, args1.cdesc.desc(),
bwd_algo1, &beta, args1.idesc.desc(),
transformed_ddy_channel + i * group_offset_out,
workspace_ptr, workspace_size));
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, args1.wdesc.desc(),
w + i * group_offset_filter, args1.odesc.desc(),
ddx + i * group_offset_in, args1.cdesc.desc(),
bwd_algo1, workspace_ptr, workspace_size, &beta,
args1.idesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
#endif // PADDLE_WITH_HIP
}
}
if (ddW) {
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
          // MIOPEN only supports beta == 0.0f
Tensor conv_x_ddw(dO->type());
conv_x_ddw.Resize(transformed_ddO_channel.dims());
T* conv_x_ddw_data = conv_x_ddw.mutable_data<T>(ctx.GetPlace());
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args2.odesc.desc(),
x + i * group_offset_in, args2.wdesc.desc(),
ddw + i * group_offset_filter, args2.cdesc.desc(),
bwd_algo2, &beta, args2.idesc.desc(),
conv_x_ddw_data + i * group_offset_out, workspace_ptr,
workspace_size));
},
workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(),
transformed_ddy_channel + i * group_offset_out, &alpha,
args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta,
args2.idesc.desc(),
transformed_ddy_channel + i * group_offset_out));
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, args2.wdesc.desc(),
ddw + i * group_offset_filter, args2.odesc.desc(),
x + i * group_offset_in, args2.cdesc.desc(), bwd_algo2,
workspace_ptr, workspace_size, &alpha,
args2.idesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
#endif // PADDLE_WITH_HIP
}
}
if ((!is_sys_pad) && (!channel_last)) {
if (strides.size() == 2U) {
Slice<paddle::platform::CUDADeviceContext, T, 4>(
ctx, &transformed_ddO_channel, ddO, starts, ends, axes);
} else if (!is_sys_pad && strides.size() == 3U) {
Slice<paddle::platform::CUDADeviceContext, T, 5>(
ctx, &transformed_ddO_channel, ddO, starts, ends, axes);
}
} else if ((!is_sys_pad) && (channel_last)) {
if (strides.size() == 2U) {
Slice<paddle::platform::CUDADeviceContext, T, 4>(
ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts,
ends, axes);
} else if (!is_sys_pad && strides.size() == 3U) {
Slice<paddle::platform::CUDADeviceContext, T, 5>(
ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts,
ends, axes);
}
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_ddO_channel, ddO);
}
}
T* transformed_dy_channel = transformed_dO.data<T>();
if (dW && ddX) {
ddx = transformed_ddX_channel.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeights(
handle, &alpha, args3.odesc.desc(),
ddx + i * group_offset_in, args3.idesc.desc(),
transformed_dy_channel + i * group_offset_out,
args3.cdesc.desc(), filter_algo, &beta,
args3.wdesc.desc(), dw + i * group_offset_filter,
workspace_ptr, workspace_size));
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, args3.idesc.desc(),
transformed_dy_channel + i * group_offset_out,
args3.odesc.desc(), ddx + i * group_offset_in,
args3.cdesc.desc(), filter_algo, workspace_ptr,
workspace_size, &beta, args3.wdesc.desc(),
dw + i * group_offset_filter));
},
workspace_size);
#endif // PADDLE_WITH_HIP
}
}
if (dX && ddW) {
ddw = ddW->data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
handle, &alpha, args4.idesc.desc(),
transformed_dy_channel + i * group_offset_out,
args4.wdesc.desc(), ddw + i * group_offset_filter,
args4.cdesc.desc(), data_algo, &beta, args4.odesc.desc(),
transformed_dx + i * group_offset_in, workspace_ptr,
workspace_size));
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
handle, &alpha, args4.idesc.desc(),
transformed_dy_channel + i * group_offset_out,
args4.wdesc.desc(), ddw + i * group_offset_filter,
args4.cdesc.desc(), data_algo, workspace_ptr,
workspace_size, &beta, args4.odesc.desc(),
transformed_dx + i * group_offset_in));
},
workspace_size);
#endif // PADDLE_WITH_HIP
}
if (channel_last) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_dX_channel, dX);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support double
REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
ops::CUDNNConvTransposeOpKernel<plat::float16>,
ops::CUDNNConvTransposeOpKernel<float>);
REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops::CUDNNConvTransposeGradOpKernel<plat::float16>,
ops::CUDNNConvTransposeGradOpKernel<float>);
REGISTER_OP_KERNEL(
conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
ops::CUDNNConvTransposeOpKernel<plat::float16>,
ops::CUDNNConvTransposeOpKernel<float>);
REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops::CUDNNConvTransposeGradOpKernel<plat::float16>,
ops::CUDNNConvTransposeGradOpKernel<float>);
#else
REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
ops::CUDNNConvTransposeOpKernel<plat::float16>,
ops::CUDNNConvTransposeOpKernel<float>,
ops::CUDNNConvTransposeOpKernel<double>);
REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops::CUDNNConvTransposeGradOpKernel<plat::float16>,
ops::CUDNNConvTransposeGradOpKernel<float>,
ops::CUDNNConvTransposeGradOpKernel<double>);
REGISTER_OP_KERNEL(
conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
ops::CUDNNConvTransposeOpKernel<plat::float16>,
ops::CUDNNConvTransposeOpKernel<float>,
ops::CUDNNConvTransposeOpKernel<double>);
REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops::CUDNNConvTransposeGradOpKernel<plat::float16>,
ops::CUDNNConvTransposeGradOpKernel<float>,
ops::CUDNNConvTransposeGradOpKernel<double>);
#endif
...@@ -13,13 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -29,165 +33,6 @@ namespace operators { ...@@ -29,165 +33,6 @@ namespace operators {
using DataLayout = framework::DataLayout; using DataLayout = framework::DataLayout;
void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ConvTranspose");
OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "ConvTranspose");
OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ConvTranspose");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> output_size =
ctx->Attrs().Get<std::vector<int>>("output_size");
std::vector<int> output_padding =
ctx->Attrs().Get<std::vector<int>>("output_padding");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
int groups = ctx->Attrs().Get<int>("groups");
std::string padding_algorithm =
ctx->Attrs().Get<std::string>("padding_algorithm");
const std::string data_layout_str =
ctx->Attrs().Get<std::string>("data_format");
const DataLayout data_layout =
ctx->IsRunMKLDNNKernel() ? DataLayout::kNCHW
: framework::StringToDataLayout(data_layout_str);
PADDLE_ENFORCE_EQ(in_dims.size() == 4 || in_dims.size() == 5, true,
platform::errors::InvalidArgument(
"Input of Op(conv_transpose) should be 4-D or "
"5-D Tensor. But received: %u-D Tensor, "
"the shape of input is [%s]",
in_dims.size(), in_dims));
PADDLE_ENFORCE_EQ(
in_dims.size(), filter_dims.size(),
platform::errors::InvalidArgument(
"The input's dimension size and filter's dimension size of "
"Op (conv_transpose) should be equal. But received: the shape of "
"input is [%s], the dimension size of input is [%d], the shape "
"of filter is [%s], the dimension size of filter is [%d]. ",
in_dims, in_dims.size(), filter_dims, filter_dims.size()));
int stride_size = strides.size();
for (int i = 0; i < stride_size; ++i) {
PADDLE_ENFORCE_GT(
strides[i], 0,
platform::errors::InvalidArgument(
"The stride of Op(Conv) should be larget than 0, but received "
"stride is %d.",
strides[i]));
}
int in_sub_stride_size = in_dims.size() - stride_size;
PADDLE_ENFORCE_EQ(
in_dims.size() - strides.size(), 2U,
platform::errors::InvalidArgument(
"The input's dimension size minus Attr(stride)'s size must "
"be euqal to 2 for Op(conv_transpose). But received: [%d], the "
"input's dimension size is [%d], the shape of input "
"is [%s], the Attr(stride)'s size is [%d].",
in_sub_stride_size, in_dims.size(), in_dims, strides.size()));
if (output_size.size())
PADDLE_ENFORCE_EQ(
output_size.size(), strides.size(),
platform::errors::InvalidArgument(
"The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
"should be the same."));
if (output_padding.size())
PADDLE_ENFORCE_EQ(
output_padding.size(), strides.size(),
platform::errors::InvalidArgument(
"The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
"should be the same."));
const int64_t C =
(data_layout != DataLayout::kNHWC ? in_dims[1]
: in_dims[in_dims.size() - 1]);
PADDLE_ENFORCE_EQ(
C, filter_dims[0],
platform::errors::InvalidArgument(
"The number of input channels should be equal to filter channels "
"for Op(conv_transpose). But received: the input's channels is "
"[%d], the shape of input is [%s], the filter's channels is [%d], "
"the shape of filter is [%s]. The data_format is %s."
"The error may come from wrong data_format setting.",
C, in_dims, filter_dims[0], filter_dims, data_layout_str));
framework::DDim in_data_dims;
if (data_layout != DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
std::vector<int64_t> output_shape({in_dims[0]});
if (data_layout != DataLayout::kNHWC) {
output_shape.push_back(filter_dims[1] * groups);
}
const int offset = (data_layout != DataLayout::kNHWC ? 2 : 1);
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
auto infer_shape = (ctx->IsRuntime() || in_dims[i + offset] > 0)
? (in_dims[i + offset] - 1) * strides[i] -
paddings[2 * i] - paddings[2 * i + 1] +
filter_extent
: -1;
if (output_size.size()) {
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
output_size[i], infer_shape,
platform::errors::InvalidArgument(
"output_size of Op(ConvTransposeOp) should not be "
"less than the infered output size. But received output_size = "
"[%s], whose dim %d is less than the infered output size [%s]",
phi::make_ddim(output_size).to_str(), i, infer_shape));
PADDLE_ENFORCE_LT(
output_size[i], infer_shape + strides[i],
platform::errors::InvalidArgument(
"output_size of Op(ConvTransposeOp) should be less "
"than infered size + stride. But received output_size = [%s], "
"whose dim %d is not less than the infered output size (%d) + "
"stride (%d) = %d",
phi::make_ddim(output_size).to_str(), i, infer_shape,
strides[i], infer_shape + strides[i]));
}
output_shape.push_back(output_size[i]);
} else if (output_padding.size()) {
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
output_padding[i], 0,
platform::errors::InvalidArgument(
"output_padding of Op(ConvTransposeOp) should not be "
"less than the 0. But received output_padding = "
"[%s], whose dim %d is less than 0",
phi::make_ddim(output_padding).to_str(), i));
PADDLE_ENFORCE_LT(
output_padding[i], std::max(strides[i], dilations[i]),
platform::errors::InvalidArgument(
"output_padding of Op(ConvTransposeOp) should be less "
"than either stride or dilation. But received output_size = "
"[%s], "
"whose dim %d is not less than either stride (%d) or "
"dilation (%d)",
phi::make_ddim(output_size).to_str(), i, strides[i],
dilations[i]));
}
output_shape.push_back((infer_shape + output_padding[i]));
} else {
output_shape.push_back(infer_shape);
}
}
if (data_layout == DataLayout::kNHWC) {
output_shape.push_back(filter_dims[1] * groups);
}
ctx->SetOutputDim("Output", phi::make_ddim(output_shape));
}
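// Illustration of the shape rule above: for input size 5, stride 2,
// paddings {1, 1}, dilation 1 and kernel size 3, the filter extent is
// 1 * (3 - 1) + 1 = 3 and the inferred output size is
// (5 - 1) * 2 - 1 - 1 + 3 = 9. An explicit output_size may then take any
// value in [9, 9 + stride), while an output_padding p (0 <= p < max(stride,
// dilation)) is simply added to the inferred size.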
framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
...@@ -217,7 +62,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( ...@@ -217,7 +62,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
} }
framework::OpKernelType ConvTransposeOp::GetKernelTypeForVar( framework::OpKernelType ConvTransposeOp::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor, const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const { const framework::OpKernelType& expected_kernel_type) const {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Only input require reshaping, weights and // Only input require reshaping, weights and
...@@ -493,17 +338,6 @@ Example: ...@@ -493,17 +338,6 @@ Example:
)DOC"); )DOC");
} }
void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
}
if (ctx->HasOutput(framework::GradVarName("Filter"))) {
ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
}
}
framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
bool use_cudnn = bool use_cudnn =
...@@ -587,24 +421,6 @@ class ConvTransposeDoubleGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -587,24 +421,6 @@ class ConvTransposeDoubleGradMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
void ConvTransposeOpDoubleGrad::InferShape(
framework::InferShapeContext* ctx) const {
auto x_dims = ctx->GetInputDim("Input");
auto w_dims = ctx->GetInputDim("Filter");
auto do_dims = ctx->GetInputDim("DOutput");
if (ctx->HasOutput("DDOutput") &&
(ctx->HasInput("DDInput") || (ctx->HasInput("DDFilter")))) {
ctx->SetOutputDim("DDOutput", do_dims);
}
if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) {
ctx->SetOutputDim("DFilter", w_dims);
}
if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) {
ctx->SetOutputDim("DInput", x_dims);
}
}
framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
bool use_cudnn = bool use_cudnn =
...@@ -635,59 +451,57 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( ...@@ -635,59 +451,57 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType(
namespace ops = paddle::operators; namespace ops = paddle::operators;
// conv2d_transpose // conv2d_transpose
DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose, Conv2dTranposeInferShapeFunctor,
PD_INFER_META(phi::ConvTransposeInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose_grad,
Conv2dTranposeGradInferShapeFunctor,
PD_INFER_META(phi::ConvTransposeGradInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(
conv2d_transpose_grad_grad, Conv2dTranposeDoubleGradInferShapeFunctor,
PD_INFER_META(phi::Conv2dTransposeDoubleGradInferMeta));
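// With the InferShape overrides removed above, shape inference is delegated to
// the phi infermeta functions; the registrations below attach these functors
// to the corresponding operators.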
REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker, ops::Conv2DTransposeOpMaker,
ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>, ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>,
ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>); ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>,
REGISTER_OPERATOR( Conv2dTranposeInferShapeFunctor);
conv2d_transpose_grad, ops::ConvTransposeOpGrad, REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad,
ops::ConvTransposeDoubleGradMaker<paddle::framework::OpDesc>, ops::ConvTransposeDoubleGradMaker<paddle::framework::OpDesc>,
ops::ConvTransposeDoubleGradMaker<paddle::imperative::OpBase>); ops::ConvTransposeDoubleGradMaker<paddle::imperative::OpBase>,
REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad); Conv2dTranposeGradInferShapeFunctor);
REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad,
REGISTER_OP_CPU_KERNEL( Conv2dTranposeDoubleGradInferShapeFunctor);
conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
// conv3d_transpose // conv3d_transpose
DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose, Conv3dTranposeInferShapeFunctor,
PD_INFER_META(phi::ConvTransposeInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose_grad,
Conv3dTranposeGradInferShapeFunctor,
PD_INFER_META(phi::ConvTransposeGradInferMeta));
REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
ops::Conv3DTransposeOpMaker, ops::Conv3DTransposeOpMaker,
ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>, ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>,
ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>); ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>,
REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); Conv3dTranposeInferShapeFunctor);
REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad,
REGISTER_OP_CPU_KERNEL( Conv3dTranposeGradInferShapeFunctor);
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
// depthwise conv2d_transpose // depthwise conv2d_transpose
DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose,
DepthWiseConv2dTranposeInferShapeFunctor,
PD_INFER_META(phi::ConvTransposeInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose_grad,
DepthWiseConv2dTranposeGradInferShapeFunctor,
PD_INFER_META(phi::ConvTransposeGradInferMeta));
REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker, ops::Conv2DTransposeOpMaker,
ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>, ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>,
ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>); ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>,
REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); DepthWiseConv2dTranposeInferShapeFunctor);
REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad,
REGISTER_OP_CPU_KERNEL( DepthWiseConv2dTranposeGradInferShapeFunctor);
depthwise_conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP_VERSION(conv_transpose) REGISTER_OP_VERSION(conv_transpose)
.AddCheckpoint( .AddCheckpoint(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(
groups, filter.dims()[0],
platform::errors::InvalidArgument(
"groups should be error to the 1st dimension of filter. But "
"received groups is %d and filter dimension[0] is %d",
groups, filter.dims()[0]));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
"dilations should be 1 in depthwise conv. "
"But received dilations is %d",
v));
}
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
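    // The forward pass of a transposed depthwise conv equals the input-gradient
    // of a regular depthwise conv, so the InputGrad functor is reused here with
    // *input acting as the upstream gradient and *output receiving the result.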
math::DepthwiseConvInputGradFunctor<phi::GPUContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output, filter, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, output, data_layout);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
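    // For the transposed op, dX is a plain forward depthwise conv of dOut with
    // the filter, while dW reuses the filter-grad functor with dOut and X
    // passed in swapped roles.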
if (input_grad) {
math::DepthwiseConvFunctor<phi::GPUContext, T> depthwiseConv;
depthwiseConv(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output_grad, filter, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, input_grad, data_layout);
}
if (filter_grad) {
phi::funcs::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<phi::GPUContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output_grad, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, filter_grad, data_layout);
}
}
};
} // namespace operators
} // namespace paddle
// conv2d
REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>,
ops::GemmConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad_grad,
ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<CUDA, double>);
// conv3d
REGISTER_OP_CUDA_KERNEL(conv3d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>,
ops::GemmConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<CUDA, double>);
// depthwise conv2d
REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose,
ops::DepthwiseConvTransposeKernel<CUDA, float>,
ops::DepthwiseConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad,
ops::DepthwiseConvTransposeGradKernel<CUDA, float>,
ops::DepthwiseConvTransposeGradKernel<CUDA, double>);
...@@ -13,72 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,72 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <string> #include "paddle/fluid/framework/op_kernel_type.h"
#include <vector> #include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename DeviceContext, typename T, size_t D>
static void Slice(const framework::ExecutionContext& context,
const Tensor* input, Tensor* out,
const std::vector<int64_t>& begin_vec,
const std::vector<int64_t>& end_vec,
const std::vector<int64_t>& axes_vec) {
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto in_dims = input->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = in_dims[i];
}
std::vector<int64_t> out_shape_vec = phi::vectorize(in_dims);
for (size_t i = 0; i < axes_vec.size(); ++i) {
offsets[axes_vec[i]] = begin_vec[i];
extents[axes_vec[i]] = end_vec[i] - begin_vec[i];
out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i];
}
framework::DDim out_dims(phi::make_ddim(out_shape_vec));
out->mutable_data<T>(out_dims, context.GetPlace());
auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*input);
auto out_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, out_dims);
EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(place, out_t, in_t,
offsets, extents);
out->Resize(out_dims);
}
template <typename DeviceContext, typename T, size_t D>
static void Slice(const framework::ExecutionContext& context,
const Tensor* input, Tensor* out, int64_t begin_idx,
int64_t end_idx, int64_t axes) {
std::vector<int64_t> begin_vec = {begin_idx};
std::vector<int64_t> end_vec = {end_idx};
std::vector<int64_t> axes_vec = {axes};
Slice<DeviceContext, T, D>(context, input, out, begin_vec, end_vec, axes_vec);
}
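// Hypothetical usage of the helpers above (names c0/c1 are illustrative only):
// Slice<platform::CPUDeviceContext, float, 4>(ctx, &in, &out, c0, c1, 3)
// copies channels [c0, c1) of an NHWC tensor, which is how the kernels below
// slice per-group data in the channel-last path.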
// Define Op classes in .h file so that other conv transpose // Define Op classes in .h file so that other conv transpose
// operator implementations can reuse the code. // operator implementations can reuse the code.
class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -94,21 +36,19 @@ class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -94,21 +36,19 @@ class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
class ConvTransposeOp : public framework::OperatorWithKernel { class ConvTransposeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override; const framework::ExecutionContext& ctx) const override;
framework::OpKernelType GetKernelTypeForVar( framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor, const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override; const framework::OpKernelType& expected_kernel_type) const override;
}; };
class ConvTransposeOpGrad : public framework::OperatorWithKernel { class ConvTransposeOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
...@@ -118,464 +58,11 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel { ...@@ -118,464 +58,11 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel {
class ConvTransposeOpDoubleGrad : public framework::OperatorWithKernel { class ConvTransposeOpDoubleGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override; const framework::ExecutionContext& ctx) const override;
}; };
template <typename DeviceContext, typename T>
class GemmConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
    // The filter will be reshaped, so it should not be a constant pointer.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
int groups = context.Attr<int>("groups");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
auto out_dims = output->dims();
const int batch_size = static_cast<int>(input->dims()[0]);
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
// input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
// input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
std::vector<int64_t> input_shape_vec = phi::vectorize(input->dims());
// filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec = phi::vectorize(filter.dims());
// use col_shape in the im2col and col2im (or vol2col and col2vol)
// calculation
// col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
if (data_layout != framework::DataLayout::kNHWC) {
col_shape_vec[0] = out_dims[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
}
} else {
col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1];
}
}
DDim col_shape(phi::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w)
DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1);
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
// output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
// output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
DDim output_shape =
phi::slice_ddim(output->dims(), 1, output->dims().size());
// input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
// input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
DDim input_matrix_shape;
if (data_layout != framework::DataLayout::kNHWC) {
input_matrix_shape = {in_dims[1], col_matrix_shape[1]};
} else {
input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]};
}
// filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
DDim filter_matrix_shape;
if (data_layout != framework::DataLayout::kNHWC) {
filter_matrix_shape = {in_dims[1], col_matrix_shape[0]};
} else {
filter_matrix_shape = {in_dims[in_dims.size() - 1], col_matrix_shape[0]};
}
filter.Resize(filter_matrix_shape);
output->mutable_data<T>(context.GetPlace());
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
set_zero(dev_ctx, output, static_cast<T>(0));
int in_step =
(data_layout != framework::DataLayout::kNHWC
? static_cast<int>(in_dims[1]) / groups
: static_cast<int>(in_dims[in_dims.size() - 1]) / groups);
int out_step =
(data_layout != framework::DataLayout::kNHWC
? static_cast<int>(out_dims[1]) / groups
: static_cast<int>(out_dims[out_dims.size() - 1]) / groups);
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
math::Col2VolFunctor<DeviceContext, T> col2vol;
math::ConcatFunctor<DeviceContext, T> concat_functor;
// convolution transpose: gemm + col2im or col2vol (similar to conv-backward
// on input)
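    // Per sample and per group g the loop below computes
    //   col = W_g^T * X_g    // a (o_c/g * k_h * k_w, h * w) column buffer
    // and col2im / col2vol then overlap-adds these columns into the spatial
    // output slice.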
size_t D = input->dims().size();
for (int i = 0; i < batch_size; i++) {
// batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first
// batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last
Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
// output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
std::vector<Tensor> output_batch_vec;
for (int g = 0; g < groups; g++) {
int64_t start = g * in_step;
int64_t end = (g + 1) * in_step;
int axes = (data_layout != framework::DataLayout::kNHWC ? 0 : 1);
Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
Tensor in_slice, out_slice;
// col_matrix = filter_slice * input_slice
// of shape (o_c/g * k_h * k_w, h * w)
// or (o_c/g * k_d * k_h * k_w, d * h * w)
if (data_layout != framework::DataLayout::kNHWC) {
in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step);
out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(filter_slice, true, in_slice, false, static_cast<T>(1.0),
&col_matrix, static_cast<T>(0.0));
} else {
Slice<DeviceContext, T, 2>(context, &input_batch, &in_slice, start,
end, axes);
start = g * out_step;
end = (g + 1) * out_step;
axes = D - 2;
if (D == 4U) {
Slice<DeviceContext, T, 3>(context, &output_batch, &out_slice,
start, end, axes);
} else if (D == 5U) {
Slice<DeviceContext, T, 4>(context, &output_batch, &out_slice,
start, end, axes);
}
blas.MatMul(filter_slice, true, in_slice, true, static_cast<T>(1.0),
&col_matrix, static_cast<T>(0.0));
}
if (data_dim == 2U) {
// col2im: col_matrix -> dy
// from (o_c/g * k_h * k_w, h * w) to (o_c/g, o_h, o_w) or (o_h, o_w,
// o_c/g)
col2im(dev_ctx, col, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&out_slice, data_layout);
} else if (data_dim == 3U) {
// col2vol: col_matrix -> dy
// from (o_c/g * k_d * k_h * k_w, d * h * w) to (o_c/g, o_d, o_h, o_w)
// or (o_d, o_h, o_w, o_c/g)
col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice,
data_layout);
}
if (data_layout == framework::DataLayout::kNHWC) {
output_batch_vec.push_back(out_slice);
}
}
if (data_layout == framework::DataLayout::kNHWC) {
concat_functor(dev_ctx, output_batch_vec, static_cast<int>(D - 2),
&output_batch);
}
}
}
};
template <typename DeviceContext, typename T>
class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
// For filter, we do not use const pointer b/c we will do reshape,
// but we should avoid modifying its value.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
int groups = context.Attr<int>("groups");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
auto out_grad_dims = output_grad->dims();
const int batch_size = static_cast<int>(input->dims()[0]);
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
// input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
// input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
std::vector<int64_t> input_shape_vec = phi::vectorize(input->dims());
// filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec = phi::vectorize(filter.dims());
// use col_shape in the im2col and col2im (or vol2col and col2vol)
// calculation
    // col_shape_vec: {o_c, k_h, k_w, h, w} or {o_c, k_d, k_h, k_w, d, h, w}
    // for channel_first
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
if (data_layout != framework::DataLayout::kNHWC) {
col_shape_vec[0] = out_grad_dims[1];
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
}
} else {
col_shape_vec[0] = out_grad_dims[out_grad_dims.size() - 1];
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1];
}
}
DDim col_shape(phi::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w)
DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1);
// output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
// output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
DDim output_shape =
phi::slice_ddim(output_grad->dims(), 1, output_grad->dims().size());
// input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
// input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
DDim input_matrix_shape;
if (data_layout != framework::DataLayout::kNHWC) {
input_matrix_shape = {in_dims[1], col_matrix_shape[1]};
} else {
input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]};
}
// filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
DDim filter_matrix_shape;
if (data_layout != framework::DataLayout::kNHWC) {
filter_matrix_shape = {in_dims[1], col_matrix_shape[0] / groups};
} else {
filter_matrix_shape = {in_dims[in_dims.size() - 1],
col_matrix_shape[0] / groups};
}
filter.Resize(filter_matrix_shape);
int in_step =
(data_layout != framework::DataLayout::kNHWC
? static_cast<int>(in_dims[1]) / groups
: static_cast<int>(in_dims[in_dims.size() - 1]) / groups);
int col_step = static_cast<int>(col_matrix_shape[0]) / groups;
// convolution transpose grad on input:
// im2col + gemm (similar to conv-forward)
    // run only when the input or the filter needs a gradient
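    // Concretely: dY is unfolded once per sample via im2col / vol2col, then per
    // group g: dX_g = W_g * col_g and dW_g += X_g * col_g^T (the channel-last
    // path uses the transposed GEMM variants together with Slice/Concat).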
auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (input_grad || filter_grad) {
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
Tensor filter_grad_;
phi::funcs::SetConstant<DeviceContext, T> set_zero;
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
math::ConcatFunctor<DeviceContext, T> concat_functor;
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
}
if (filter_grad) { // filter_grad_ size (i_c, o_c/g, k_h, k_w)
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
}
size_t D = input->dims().size();
for (int i = 0; i < batch_size; i++) {
// batch with size (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for
// channel_first
// batch with size (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for
// channel_last
Tensor output_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_shape);
if (data_dim == 2U) {
// im2col: dy -> col matrix
// from (o_c, o_h, o_w) to (o_c * k_h * k_w, i_h * i_w) for
// channel_first
// from (o_h, o_w, o_c) to (o_c * k_h * k_w, i_h * i_w) for
// channel_last
im2col(dev_ctx, output_grad_batch, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col, data_layout);
} else if (data_dim == 3U) {
// vol2col: dy -> col_matrix
// from (o_c, o_d, o_h, o_w) to (o_c * k_d * k_h * k_w, i_d * i_h *
// i_w) for channel_first
// from (o_d, o_h, o_w, o_c) to (i_d * i_h * i_w, o_c * k_d * k_h *
// k_w) for channel_last
vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings,
&col, data_layout);
}
if (input_grad) {
// batch with size (i_c, i_h, i_w) or (i_h, i_w, i_c)
Tensor input_grad_batch =
input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
// gemm: dx = filter * dy
// (i_c, o_c * k_h * k_w) * (o_c * k_h * k_w, i_h * i_w) -> (i_c, i_h
// * i_w)
// or
// (i_c, o_c * k_d * k_h * k_w) * (o_c * k_d * k_h * k_w, i_d * i_h *
// i_w) -> (i_c,
// i_d, i_h, i_w)
// gemm: dx = dy^T * filter^T for channel_last
std::vector<Tensor> input_grad_batch_vec;
for (int g = 0; g < groups; g++) {
// input_grad_slice: (i_c/g, i_h * i_w) or (i_c/g, i_d * i_h * i_w)
// for channel_first
// input_grad_slice: (i_h * i_w, i_c/g) or (i_d * i_h * i_w, i_c/g)
// for channel_last
// filter_slice: (i_c/g, o_c/g * k_h * k_w)
Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
// col_matrix_slice: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d *
// k_h * k_w, d * h * w)
Tensor col_matrix_slice =
col_matrix.Slice(g * col_step, (g + 1) * col_step);
if (data_layout != framework::DataLayout::kNHWC) {
Tensor input_grad_slice =
input_grad_batch.Slice(g * in_step, (g + 1) * in_step);
blas.MatMul(filter_slice, false, col_matrix_slice, false,
static_cast<T>(1.0), &input_grad_slice,
static_cast<T>(0.0));
} else {
Tensor input_grad_slice;
Slice<DeviceContext, T, 2>(context, &input_grad_batch,
&input_grad_slice, g * in_step,
(g + 1) * in_step, 1);
blas.MatMul(col_matrix_slice, true, filter_slice, true,
static_cast<T>(1.0), &input_grad_slice,
static_cast<T>(0.0));
DDim input_grad_slice_shape;
if (data_dim == 2U) {
input_grad_slice_shape = {in_dims[1], in_dims[2], in_step};
} else {
input_grad_slice_shape = {in_dims[1], in_dims[2], in_dims[3],
in_step};
}
input_grad_slice =
input_grad_slice.Resize(input_grad_slice_shape);
input_grad_batch_vec.push_back(input_grad_slice);
}
}
if (data_layout == framework::DataLayout::kNHWC) {
concat_functor(dev_ctx, input_grad_batch_vec,
static_cast<int>(D - 2), &input_grad_batch);
}
}
if (filter_grad) {
          // input batch: (i_c, i_h * i_w) or (i_h * i_w, i_c)
Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// gemm: d_filter = x * dy^T
// (i_c, i_h * i_w) * (i_h * i_w, o_c * k_h * k_w) -> (i_c, o_c * k_h
// * k_w)
// or
// (i_c, i_d * i_h * i_w) * (i_d * i_h * i_w, o_c * k_d * k_h * k_w)
// -> (i_c, o_c * k_d *
// k_h * k_w)
// gemm: d_filter = x^T * dy^T for channel_last
for (int g = 0; g < groups; g++) {
Tensor filter_grad_slice =
filter_grad_.Slice(g * in_step, (g + 1) * in_step);
Tensor col_matrix_slice =
col_matrix.Slice(g * col_step, (g + 1) * col_step);
if (data_layout != framework::DataLayout::kNHWC) {
Tensor in_batch_slice =
in_batch.Slice(g * in_step, (g + 1) * in_step);
blas.MatMul(in_batch_slice, false, col_matrix_slice, true,
static_cast<T>(1.0), &filter_grad_slice,
static_cast<T>(1.0));
} else {
Tensor in_batch_slice;
Slice<DeviceContext, T, 2>(context, &in_batch, &in_batch_slice,
g * in_step, (g + 1) * in_step, 1);
blas.MatMul(in_batch_slice, true, col_matrix_slice, true,
static_cast<T>(1.0), &filter_grad_slice,
static_cast<T>(1.0));
}
}
}
}
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using NPUDeviceContext = platform::NPUDeviceContext; using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T> template <typename T>
...@@ -55,8 +59,8 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel<T> { ...@@ -55,8 +59,8 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel<T> {
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims); std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, phi::UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm,
in_data_dims, stride, ksize); in_data_dims, stride, ksize);
// construct NPU attr // construct NPU attr
std::vector<int> strides(4, 1); std::vector<int> strides(4, 1);
...@@ -137,8 +141,8 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> { ...@@ -137,8 +141,8 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
framework::DDim filter_data_dims = framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size()); phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims); std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
std::vector<int> strides_vec(4, 1); std::vector<int> strides_vec(4, 1);
std::vector<int> dilations_vec(4, 1); std::vector<int> dilations_vec(4, 1);
......
...@@ -8,15 +8,22 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,15 +8,22 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
// target_len == 2 || target_len == 4 // target_len == 2 || target_len == 4
inline std::vector<int> vector_extend(const std::vector<int>& src, inline std::vector<int> vector_extend(const std::vector<int>& src,
int target_len) { int target_len) {
...@@ -61,8 +68,8 @@ class Conv2DTransposeXPUKernel : public framework::OpKernel<T> { ...@@ -61,8 +68,8 @@ class Conv2DTransposeXPUKernel : public framework::OpKernel<T> {
framework::DDim filter_data_dims = framework::DDim filter_data_dims =
phi::slice_ddim(filter.dims(), 2, filter.dims().size()); phi::slice_ddim(filter.dims(), 2, filter.dims().size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims); std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
const int img_yc = static_cast<int>(input->dims()[1]); const int img_yc = static_cast<int>(input->dims()[1]);
...@@ -135,8 +142,8 @@ class Conv2DTransposeGradXPUKernel : public framework::OpKernel<T> { ...@@ -135,8 +142,8 @@ class Conv2DTransposeGradXPUKernel : public framework::OpKernel<T> {
framework::DDim filter_data_dims = framework::DDim filter_data_dims =
phi::slice_ddim(filter.dims(), 2, filter.dims().size()); phi::slice_ddim(filter.dims(), 2, filter.dims().size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims); std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
const int img_yc = static_cast<int>(input->dims()[1]); const int img_yc = static_cast<int>(input->dims()[1]);
......
...@@ -243,8 +243,6 @@ class ConcatFunctor<platform::MLUDeviceContext, T> { ...@@ -243,8 +243,6 @@ class ConcatFunctor<platform::MLUDeviceContext, T> {
const int axis_t = axis; const int axis_t = axis;
const int ins_size_t = ins_size; const int ins_size_t = ins_size;
auto place = context.GetPlace();
output->mutable_data<T>(place);
// mlu should do sth // mlu should do sth
// init ins tensors // init ins tensors
...@@ -295,7 +293,6 @@ class SplitFunctor<platform::MLUDeviceContext, T> { ...@@ -295,7 +293,6 @@ class SplitFunctor<platform::MLUDeviceContext, T> {
std::vector<cnnlTensorDescriptor_t> desc_vector; std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < out_size; i++) { for (size_t i = 0; i < out_size; i++) {
(*outputs)[i]->Resize(outs_dims[i]); (*outputs)[i]->Resize(outs_dims[i]);
(*outputs)[i]->mutable_data<T>(context.GetPlace());
output_descs.emplace_back( output_descs.emplace_back(
MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY,
ToCnnlDataType((*outputs)[i]->dtype()))); ToCnnlDataType((*outputs)[i]->dtype())));
......
...@@ -12,9 +12,12 @@ ...@@ -12,9 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h"
#include <string> #include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -56,22 +59,12 @@ class FrobeniusNormOpMaker : public ops::ReduceOpMaker { ...@@ -56,22 +59,12 @@ class FrobeniusNormOpMaker : public ops::ReduceOpMaker {
virtual std::string GetOpType() const { return "Reduce frobenius_norm"; } virtual std::string GetOpType() const { return "Reduce frobenius_norm"; }
}; };
DECLARE_INFER_SHAPE_FUNCTOR(frobenius_norm, FrobeniusNormInferShapeFunctor,
PD_INFER_META(phi::ReduceInferMetaBase));
REGISTER_OPERATOR(frobenius_norm, ops::ReduceOp, FrobeniusNormOpMaker, REGISTER_OPERATOR(frobenius_norm, ops::ReduceOp, FrobeniusNormOpMaker,
ops::FrobeniusNormOpGradMaker<paddle::framework::OpDesc>, ops::FrobeniusNormOpGradMaker<paddle::framework::OpDesc>,
ops::FrobeniusNormOpGradMaker<paddle::imperative::OpBase>); ops::FrobeniusNormOpGradMaker<paddle::imperative::OpBase>,
FrobeniusNormInferShapeFunctor);
REGISTER_OPERATOR(frobenius_norm_grad, ops::ReduceGradOp); REGISTER_OPERATOR(frobenius_norm_grad, ops::ReduceGradOp);
REGISTER_OP_CPU_KERNEL(frobenius_norm,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::FrobeniusNormFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::FrobeniusNormFunctor>);
template <typename T>
using CPUFrobeniusNormGradKernel =
ops::FrobeniusNormGradKernel<paddle::platform::CPUDeviceContext, T,
ops::FrobeniusNormGradFunctor>;
REGISTER_OP_CPU_KERNEL(frobenius_norm_grad, CPUFrobeniusNormGradKernel<float>,
CPUFrobeniusNormGradKernel<double>);
...@@ -117,7 +117,7 @@ endif() ...@@ -117,7 +117,7 @@ endif()
cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
# separate init from device_context to avoid cycle dependencies # separate init from device_context to avoid cycle dependencies
cc_library(init SRCS init.cc DEPS device_context custom_kernel) cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool)
# memcpy depends on device_context; add deps individually here to # memcpy depends on device_context; add deps individually here to
# avoid cycle dependencies # avoid cycle dependencies
......
...@@ -13,7 +13,7 @@ IF(WITH_IPU) ...@@ -13,7 +13,7 @@ IF(WITH_IPU)
"ipu_device.cc" "ipu_device.cc"
) )
cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper) cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper popdist)
cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce)
add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC}) add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC})
add_dependencies(paddle_ipu ipu_backend) add_dependencies(paddle_ipu ipu_backend)
......
...@@ -32,6 +32,7 @@ IpuBackend* IpuBackend::GetInstance() { ...@@ -32,6 +32,7 @@ IpuBackend* IpuBackend::GetInstance() {
IpuBackend::IpuBackend() { IpuBackend::IpuBackend() {
compiler_ = std::make_unique<Compiler>(); compiler_ = std::make_unique<Compiler>();
executor_ = std::make_unique<Executor>(); executor_ = std::make_unique<Executor>();
timer_ = std::make_unique<platform::Timer>();
} }
IpuBackend::~IpuBackend() { IpuBackend::~IpuBackend() {
...@@ -43,6 +44,7 @@ void IpuBackend::Compile(Graph* graph, ...@@ -43,6 +44,7 @@ void IpuBackend::Compile(Graph* graph,
const std::vector<std::string>& feed_list, const std::vector<std::string>& feed_list,
const std::vector<std::string>& fetch_list) { const std::vector<std::string>& fetch_list) {
VLOG(10) << "enter IpuBackend::Compile"; VLOG(10) << "enter IpuBackend::Compile";
is_compiled_ = false;
compiler_->Prepare(graph); compiler_->Prepare(graph);
compiler_->InitInputs(feed_list); compiler_->InitInputs(feed_list);
compiler_->LowerConstants(scope_); compiler_->LowerConstants(scope_);
...@@ -52,31 +54,25 @@ void IpuBackend::Compile(Graph* graph, ...@@ -52,31 +54,25 @@ void IpuBackend::Compile(Graph* graph,
if (ipu_strategy_->is_training) { if (ipu_strategy_->is_training) {
compiler_->LowerOptimizer(scope_); compiler_->LowerOptimizer(scope_);
} }
if (!ipu_strategy_->onnx_dump_path.empty()) {
SaveModelProto(ipu_strategy_->onnx_dump_path);
}
executor_->SetCompilerResources(compiler_->GetResources()); executor_->SetCompilerResources(compiler_->GetResources());
executor_->Prepare(compiler_->GetModelProto());
is_compiled_ = true; is_compiled_ = true;
// when call compile, means a new graph
is_prepared_ = false;
VLOG(10) << "leave IpuBackend::Compile"; VLOG(10) << "leave IpuBackend::Compile";
} }
void IpuBackend::Run(const std::vector<const Tensor*>& inputs, void IpuBackend::Run(const std::vector<const Tensor*>& inputs,
const std::vector<Tensor*>& outputs, const std::vector<Tensor*>& outputs,
const framework::ExecutionContext& ctx) { const framework::ExecutionContext& ctx) {
Prepare();
timer_->Start(); timer_->Start();
executor_->Run(inputs, outputs, ctx); executor_->Run(inputs, outputs, ctx);
timer_->Pause(); timer_->Pause();
VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)"; VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)";
} }
void IpuBackend::Prepare() { void IpuBackend::WeightsToHost() { executor_->WeightsToHost(); }
if (!is_prepared_) {
executor_->Prepare(compiler_->GetModelProto());
timer_.reset(new platform::Timer());
is_prepared_ = true;
}
}
void IpuBackend::Detach() { executor_->Detach(); } void IpuBackend::Detach() { executor_->Detach(); }
...@@ -101,12 +97,10 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { ...@@ -101,12 +97,10 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) {
} }
void IpuBackend::SaveModelProto(const std::string& path) { void IpuBackend::SaveModelProto(const std::string& path) {
if (ipu_strategy_->is_training && is_prepared_) { if (ipu_strategy_->is_training && is_compiled_) {
executor_->SaveModelToHost(path); executor_->SaveModelToHost(path);
} else if (is_compiled_) {
compiler_->SaveModelProtoNoCheck(path);
} else { } else {
LOG(WARNING) << "Model is empty"; compiler_->SaveModelProtoNoCheck(path);
} }
} }
......
...@@ -60,6 +60,9 @@ class IpuBackend { ...@@ -60,6 +60,9 @@ class IpuBackend {
const std::vector<Tensor *> &outputs, const std::vector<Tensor *> &outputs,
const framework::ExecutionContext &ctx); const framework::ExecutionContext &ctx);
// Sync weights from IPU while training
void WeightsToHost();
// detach IPU manually // detach IPU manually
void Detach(); void Detach();
...@@ -76,22 +79,17 @@ class IpuBackend { ...@@ -76,22 +79,17 @@ class IpuBackend {
void SaveModelProto(const std::string &path); void SaveModelProto(const std::string &path);
private: private:
void Prepare();
private:
std::unique_ptr<Compiler> compiler_;
std::unique_ptr<Executor> executor_;
bool is_compiled_ = false;
bool is_prepared_ = false;
// not own // not own
const Scope *scope_ = nullptr; const Scope *scope_ = nullptr;
const IpuStrategy *ipu_strategy_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr;
private: // own
// time record for IpuBackend::Run std::unique_ptr<Compiler> compiler_;
std::unique_ptr<Executor> executor_;
std::unique_ptr<platform::Timer> timer_; std::unique_ptr<platform::Timer> timer_;
bool is_compiled_ = false;
DISABLE_COPY_AND_ASSIGN(IpuBackend); DISABLE_COPY_AND_ASSIGN(IpuBackend);
}; };
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <popart/adaptive.hpp> #include <popart/adaptive.hpp>
#include <popart/optimizer.hpp> #include <popart/optimizer.hpp>
#include <popart/sgd.hpp> #include <popart/sgd.hpp>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/device/ipu/ipu_utils.h"
...@@ -25,13 +26,20 @@ namespace paddle { ...@@ -25,13 +26,20 @@ namespace paddle {
namespace platform { namespace platform {
namespace ipu { namespace ipu {
popart::AdamMode AdamModeFromStr(const std::string& str) { popart::AdamMode AdamModeFromStr(const std::string& str,
const bool& use_no_bias_optimizer) {
if (str == "adam") { if (str == "adam") {
return popart::AdamMode::Adam; if (!use_no_bias_optimizer)
return popart::AdamMode::Adam;
else
return popart::AdamMode::AdamNoBias;
} else if (str == "adamax") { } else if (str == "adamax") {
return popart::AdamMode::AdaMax; return popart::AdamMode::AdaMax;
} else if (str == "lamb") { } else if (str == "lamb") {
return popart::AdamMode::Lamb; if (!use_no_bias_optimizer)
return popart::AdamMode::Lamb;
else
return popart::AdamMode::LambNoBias;
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"Uknown AdamMode: %s, AdamMode must be one of these values: adam, " "Uknown AdamMode: %s, AdamMode must be one of these values: adam, "
...@@ -70,6 +78,17 @@ popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) { ...@@ -70,6 +78,17 @@ popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) {
} }
} }
popart::DataType DataTypeFromStr(const std::string& str) {
if (str == "FLOAT") {
return popart::DataType::FLOAT;
} else if (str == "FLOAT16") {
return popart::DataType::FLOAT16;
} else {
PADDLE_THROW(
platform::errors::Unimplemented("Unsupported DataType: %s", str));
}
}
template <typename T> template <typename T>
T GetAttrAllowNull(std::string attr, OpDesc* op_desc) { T GetAttrAllowNull(std::string attr, OpDesc* op_desc) {
if (op_desc->HasAttr(attr)) { if (op_desc->HasAttr(attr)) {
...@@ -122,6 +141,17 @@ void Compiler::Prepare(const Graph* graph) { ...@@ -122,6 +141,17 @@ void Compiler::Prepare(const Graph* graph) {
builder_ = popart::Builder::create(); builder_ = popart::Builder::create();
resources_ = std::make_unique<CompilerResources>(); resources_ = std::make_unique<CompilerResources>();
graph_helper_ = std::make_unique<GraphHelper>(graph); graph_helper_ = std::make_unique<GraphHelper>(graph);
// Set the flag of set_amp_for_all_
for (auto* node : graph_helper_->sorted_ops) {
auto* op_desc = node->Op();
auto op_type = op_desc->Type();
if (op_type == "popart_matmul") {
if (op_desc->HasAttr(sAvailMemAttribute)) {
set_amp_for_all_ = false;
return;
}
}
}
} }
void Compiler::RegisterOpFunc() { void Compiler::RegisterOpFunc() {
...@@ -155,7 +185,9 @@ void Compiler::RegisterOpFunc() { ...@@ -155,7 +185,9 @@ void Compiler::RegisterOpFunc() {
auto debug_context = BuildDebugContext(op_desc); \ auto debug_context = BuildDebugContext(op_desc); \
auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \
auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ auto aiOnnxOpset = builder_->aiOnnxOpset11(); \
PushNameScope(op_desc); \
auto output_ids = OnnxImpl(inputs Args, debug_context); \ auto output_ids = OnnxImpl(inputs Args, debug_context); \
PopNameScope(op_desc); \
SetIpuIndexStage(output_ids, op_desc); \ SetIpuIndexStage(output_ids, op_desc); \
SetAMPAttributes(output_ids, op_desc); \ SetAMPAttributes(output_ids, op_desc); \
SetSerializeAttributes(output_ids, op_desc); \ SetSerializeAttributes(output_ids, op_desc); \
...@@ -241,7 +273,9 @@ void Compiler::LowerConstants(const Scope* scope) { ...@@ -241,7 +273,9 @@ void Compiler::LowerConstants(const Scope* scope) {
popart::TensorInfo tensor_info(PdDataType2PopartType(tensor->dtype()), popart::TensorInfo tensor_info(PdDataType2PopartType(tensor->dtype()),
shape); shape);
const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info));
PushNameScope(op_desc);
popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data);
PopNameScope(op_desc);
SetIpuIndexStage(result, op_desc); SetIpuIndexStage(result, op_desc);
resources_->tensors.emplace(tensor_name, result); resources_->tensors.emplace(tensor_name, result);
} }
...@@ -261,6 +295,10 @@ void Compiler::LowerWeights(const Scope* scope) { ...@@ -261,6 +295,10 @@ void Compiler::LowerWeights(const Scope* scope) {
VLOG(10) << "found existed one, skip lowering Weight: " << var_name; VLOG(10) << "found existed one, skip lowering Weight: " << var_name;
continue; continue;
} }
if (var_name.rfind("learning_rate", 0) == 0) {
VLOG(10) << "skip learning_rate_var: " << var_name;
continue;
}
VLOG(10) << "lowering weight: " << var_name; VLOG(10) << "lowering weight: " << var_name;
auto var = scope->FindVar(var_name); auto var = scope->FindVar(var_name);
...@@ -273,10 +311,15 @@ void Compiler::LowerWeights(const Scope* scope) { ...@@ -273,10 +311,15 @@ void Compiler::LowerWeights(const Scope* scope) {
} }
popart::TensorInfo tensor_info(dtype, shape); popart::TensorInfo tensor_info(dtype, shape);
popart::ConstVoidData const_data{tensor.data(), tensor_info}; popart::ConstVoidData const_data{tensor.data(), tensor_info};
popart::TensorId result = if (!node->outputs.empty()) {
builder_->addInitializedInputTensor(const_data, var_name); auto op_node = node->outputs[0];
resources_->tensors.emplace(var_name, result); PushNameScope(op_node->Op());
resources_->weights.push_back(result); popart::TensorId result =
builder_->addInitializedInputTensor(const_data, var_name);
PopNameScope(op_node->Op());
resources_->tensors.emplace(var_name, result);
resources_->weights.push_back(var_name);
}
} }
} }
} }
...@@ -298,7 +341,10 @@ void Compiler::LowerBody() { ...@@ -298,7 +341,10 @@ void Compiler::LowerBody() {
} else if (op_type == "popart_checkpointoutput") { } else if (op_type == "popart_checkpointoutput") {
auto inputs = GetOpInputs(op_desc); auto inputs = GetOpInputs(op_desc);
auto outputs = GetOpOutputs(op_desc); auto outputs = GetOpOutputs(op_desc);
PushNameScope(op_desc);
auto output_ids = builder_->checkpointOutput(inputs); auto output_ids = builder_->checkpointOutput(inputs);
PopNameScope(op_desc);
SetIpuIndexStage(output_ids, op_desc);
InsertTensors(outputs, output_ids); InsertTensors(outputs, output_ids);
} else if (op_type == "popart_custom_op") { } else if (op_type == "popart_custom_op") {
auto inputs = GetOpInputs(op_desc); auto inputs = GetOpInputs(op_desc);
...@@ -313,9 +359,11 @@ void Compiler::LowerBody() { ...@@ -313,9 +359,11 @@ void Compiler::LowerBody() {
BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type"));
VLOG(10) << "Build graph from custom op: " << __op_type; VLOG(10) << "Build graph from custom op: " << __op_type;
auto it = custom_ops_.find(__op_type); auto it = custom_ops_.find(__op_type);
PushNameScope(op_desc);
auto output_ids = auto output_ids =
builder_->customOp(it->second.popart_op, it->second.popart_op.version, builder_->customOp(it->second.popart_op, it->second.popart_op.version,
inputs, outputs.size(), attributes, debug_context); inputs, outputs.size(), attributes, debug_context);
PopNameScope(op_desc);
SetIpuIndexStage(output_ids, op_desc); SetIpuIndexStage(output_ids, op_desc);
InsertTensors(outputs, output_ids); InsertTensors(outputs, output_ids);
} else if (op_type == "popart_printtensor") { } else if (op_type == "popart_printtensor") {
...@@ -325,8 +373,10 @@ void Compiler::LowerBody() { ...@@ -325,8 +373,10 @@ void Compiler::LowerBody() {
auto print_gradient = auto print_gradient =
BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient"));
auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title"));
PushNameScope(op_desc);
auto output_ids = builder_->aiGraphcoreOpset1().printtensor( auto output_ids = builder_->aiGraphcoreOpset1().printtensor(
inputs, print_gradient, debug_context, title); inputs, print_gradient, debug_context, title);
PopNameScope(op_desc);
SetIpuIndexStage(output_ids, op_desc); SetIpuIndexStage(output_ids, op_desc);
InsertTensors(outputs, output_ids); InsertTensors(outputs, output_ids);
} else { } else {
...@@ -367,8 +417,31 @@ void Compiler::LowerOptimizer(const Scope* scope) { ...@@ -367,8 +417,31 @@ void Compiler::LowerOptimizer(const Scope* scope) {
resources_->with_lr_sched = false; resources_->with_lr_sched = false;
} }
VLOG(10) << "Set initial lr: " << resources_->lr; VLOG(10) << "Set initial lr: " << resources_->lr;
auto loss_scaling = ipu_strategy_->loss_scaling;
// Get the type of optimizer
auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type")); auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type"));
// Set weight decay by tensor names for Lamb
auto weight_decay_vars = BOOST_GET_CONST(
std::vector<std::string>, op_desc->GetAttr("weight_decay_vars"));
auto weight_decay_values = BOOST_GET_CONST(
std::vector<float>, op_desc->GetAttr("weight_decay_values"));
// Get the maximum permissible value for gradient clipping
std::vector<popart::ClipNormSettings> clip_norm_settings = {};
if (op_desc->HasAttr("clip_norm")) {
auto clip_norm = BOOST_GET_CONST(float, op_desc->GetAttr("clip_norm"));
clip_norm_settings.push_back(
popart::ClipNormSettings::clipAllWeights(clip_norm));
VLOG(10) << "Set the global gradient clipping with the maximum "
"permissible value: "
<< clip_norm;
}
// Values from ipu_strategy
auto loss_scaling = ipu_strategy_->loss_scaling;
auto accl1_type = DataTypeFromStr(ipu_strategy_->accl1_type);
auto accl2_type = DataTypeFromStr(ipu_strategy_->accl2_type);
auto accl3_type = DataTypeFromStr(ipu_strategy_->accl3_type);
if (type == "sgd") { if (type == "sgd") {
auto weight_decay = auto weight_decay =
BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
...@@ -376,12 +449,18 @@ void Compiler::LowerOptimizer(const Scope* scope) { ...@@ -376,12 +449,18 @@ void Compiler::LowerOptimizer(const Scope* scope) {
resources_->optimizer_fn = [=](float lr) { resources_->optimizer_fn = [=](float lr) {
return std::make_unique<popart::SGD>( return std::make_unique<popart::SGD>(
popart::OptimizerValue(lr, false), popart::OptimizerValue(lr, false),
popart::OptimizerValue(weight_decay, true), popart::OptimizerValue(weight_decay, false),
popart::OptimizerValue(momentum, true), popart::OptimizerValue(momentum, true),
popart::SGD::getUnsetDampening(), popart::SGD::getUnsetDampening(),
popart::SGD::getUnsetVelocityScaling(), popart::SGD::getUnsetVelocityScaling(),
popart::OptimizerValue(loss_scaling, true)); popart::OptimizerValue(loss_scaling, true), clip_norm_settings);
}; };
resources_->eval_optimizer = std::make_unique<popart::SGD>(
popart::OptimizerValue(0.0, false),
popart::OptimizerValue(0.0, false),
popart::OptimizerValue(0.0, true), popart::SGD::getUnsetDampening(),
popart::SGD::getUnsetVelocityScaling(),
popart::OptimizerValue(loss_scaling, true), clip_norm_settings);
} else if (type == "adam") { } else if (type == "adam") {
auto weight_decay = auto weight_decay =
BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
...@@ -392,22 +471,79 @@ void Compiler::LowerOptimizer(const Scope* scope) { ...@@ -392,22 +471,79 @@ void Compiler::LowerOptimizer(const Scope* scope) {
VLOG(10) << "set max_weight_norm: " << mwn; VLOG(10) << "set max_weight_norm: " << mwn;
auto adam_mode_ = auto adam_mode_ =
BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode")); BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode"));
auto adam_mode = AdamModeFromStr(adam_mode_); auto adam_mode =
auto weight_decay_mode_ = AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer);
BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode;
if (weight_decay_mode_.empty()) {
weight_decay_mode_ = BOOST_GET_CONST(
std::string, op_desc->GetAttr("weight_decay_mode"));
}
auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_);
resources_->optimizer_fn = [=](float lr) { resources_->optimizer_fn = [=](float lr) {
return std::make_unique<popart::Adam>( if (adam_mode == popart::AdamMode::Lamb ||
popart::OptimizerValue(lr, false), adam_mode == popart::AdamMode::LambNoBias) {
popart::OptimizerValue(weight_decay, true), const std::map<std::string, std::pair<float, bool>>
popart::OptimizerValue(beta1, true), optimizer_value = {{"defaultLearningRate", {lr, false}},
popart::OptimizerValue(beta2, true), {"defaultBeta1", {beta1, false}},
{"defaultBeta2", {beta2, false}},
{"defaultEps", {eps, true}},
{"lossScaling", {loss_scaling, true}},
{"defaultMaxWeightNorm", {mwn, true}}};
auto optimizer_instance = std::make_unique<popart::Adam>(
optimizer_value, adam_mode, weight_decay_mode,
popart::DataType::UNDEFINED, accl1_type, accl2_type,
clip_norm_settings);
for (int i = 0; i < weight_decay_vars.size(); i++) {
optimizer_instance->insertSpecific(
weight_decay_vars[i],
{{"weightDecay", {weight_decay_values[i], false}}});
VLOG(10) << "Set Tensor " << weight_decay_vars[i]
<< " weight decay as " << weight_decay_values[i];
}
return optimizer_instance;
} else {
return std::make_unique<popart::Adam>(
popart::OptimizerValue(lr, false),
popart::OptimizerValue(weight_decay, false),
popart::OptimizerValue(beta1, false),
popart::OptimizerValue(beta2, false),
popart::OptimizerValue(eps, true),
popart::OptimizerValue(loss_scaling, true),
popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
popart::DataType::UNDEFINED, accl1_type, accl2_type,
clip_norm_settings);
}
};
if (adam_mode == popart::AdamMode::Lamb ||
adam_mode == popart::AdamMode::LambNoBias) {
const std::map<std::string, std::pair<float, bool>> optimizer_value =
{{"defaultLearningRate", {0.0, false}},
{"defaultBeta1", {beta1, false}},
{"defaultBeta2", {beta2, false}},
{"defaultEps", {eps, true}},
{"lossScaling", {loss_scaling, true}},
{"defaultMaxWeightNorm", {mwn, true}}};
auto eval_optimizer = std::make_unique<popart::Adam>(
optimizer_value, adam_mode, weight_decay_mode,
popart::DataType::UNDEFINED, popart::DataType::FLOAT,
popart::DataType::FLOAT, clip_norm_settings);
for (int i = 0; i < weight_decay_vars.size(); i++) {
eval_optimizer->insertSpecific(weight_decay_vars[i],
{{"weightDecay", {0.0, false}}});
}
resources_->eval_optimizer = std::move(eval_optimizer);
} else {
resources_->eval_optimizer = std::make_unique<popart::Adam>(
popart::OptimizerValue(0.0, false),
popart::OptimizerValue(0.0, false),
popart::OptimizerValue(beta1, false),
popart::OptimizerValue(beta2, false),
popart::OptimizerValue(eps, true), popart::OptimizerValue(eps, true),
popart::OptimizerValue(loss_scaling, true), popart::OptimizerValue(loss_scaling, true),
popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
popart::DataType::UNDEFINED, popart::DataType::FLOAT, popart::DataType::UNDEFINED, popart::DataType::FLOAT,
popart::DataType::FLOAT); popart::DataType::FLOAT, clip_norm_settings);
}; }
} else if (type == "adaptive") { } else if (type == "adaptive") {
auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha")); auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha"));
auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum"));
...@@ -417,21 +553,33 @@ void Compiler::LowerOptimizer(const Scope* scope) { ...@@ -417,21 +553,33 @@ void Compiler::LowerOptimizer(const Scope* scope) {
auto adaptive_mode_ = auto adaptive_mode_ =
BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode")); BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode"));
auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_); auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_);
auto weight_decay_mode_ = auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode;
BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); if (weight_decay_mode_.empty()) {
weight_decay_mode_ = BOOST_GET_CONST(
std::string, op_desc->GetAttr("weight_decay_mode"));
}
auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_);
resources_->optimizer_fn = [=](float lr) { resources_->optimizer_fn = [=](float lr) {
return std::make_unique<popart::Adaptive>( return std::make_unique<popart::Adaptive>(
popart::OptimizerValue(lr, false), popart::OptimizerValue(lr, false),
popart::OptimizerValue(weight_decay, true), popart::OptimizerValue(weight_decay, false),
popart::OptimizerValue(alpha, true), popart::OptimizerValue(alpha, true),
popart::OptimizerValue(momentum, true), popart::OptimizerValue(momentum, true),
popart::OptimizerValue(eps, true), popart::OptimizerValue(eps, true),
popart::OptimizerValue(loss_scaling, true), adaptive_mode, popart::OptimizerValue(loss_scaling, true), adaptive_mode,
weight_decay_mode, popart::DataType::UNDEFINED, weight_decay_mode, popart::DataType::UNDEFINED, accl1_type,
popart::DataType::FLOAT, popart::DataType::FLOAT, accl2_type, accl3_type);
popart::DataType::FLOAT);
}; };
resources_->eval_optimizer = std::make_unique<popart::Adaptive>(
popart::OptimizerValue(0.0, false),
popart::OptimizerValue(0.0, false),
popart::OptimizerValue(alpha, true),
popart::OptimizerValue(momentum, true),
popart::OptimizerValue(eps, true),
popart::OptimizerValue(loss_scaling, true), adaptive_mode,
weight_decay_mode, popart::DataType::UNDEFINED,
popart::DataType::FLOAT, popart::DataType::FLOAT,
popart::DataType::UNDEFINED);
} else { } else {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"optimizer %s is not implemented", type)); "optimizer %s is not implemented", type));
...@@ -510,9 +658,32 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id, ...@@ -510,9 +658,32 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id,
const OpDesc* op_desc) { const OpDesc* op_desc) {
VLOG(10) << "enter Compiler::SetAMPAttributes"; VLOG(10) << "enter Compiler::SetAMPAttributes";
if (op_desc->Type() == "popart_matmul") { if (op_desc->Type() == "popart_matmul") {
auto amp = ipu_strategy_->available_memory_proportion; if (set_amp_for_all_) {
if (amp > 0.0f && amp <= 1.0) { auto amp = ipu_strategy_->available_memory_proportion;
builder_->setAvailableMemoryProportion(tensor_id, amp); if (amp < 0.0f || amp > 1.0) {
PADDLE_THROW(platform::errors::InvalidArgument(
"AvailableMemoryProportion %f is invalid, which should be set 0 <= "
"amp <= 1",
amp));
}
if (amp > 0.0f) {
builder_->setAvailableMemoryProportion(tensor_id, amp);
}
} else {
if (op_desc->HasAttr(sAvailMemAttribute)) {
auto amp = BOOST_GET_CONST(float, op_desc->GetAttr(sAvailMemAttribute));
if (amp < 0.0f || amp > 1.0) {
PADDLE_THROW(platform::errors::InvalidArgument(
"AvailableMemoryProportion %f is invalid, which should be set 0 "
"<= amp <= 1",
amp));
}
if (amp > 0.0f) {
builder_->setAvailableMemoryProportion(tensor_id, amp);
VLOG(10) << "set available_memory_proportion for tensor: "
<< tensor_id << " as " << amp;
}
}
} }
} }
VLOG(10) << "leave Compiler::SetAMPAttributes"; VLOG(10) << "leave Compiler::SetAMPAttributes";
...@@ -602,6 +773,29 @@ popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) { ...@@ -602,6 +773,29 @@ popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) {
return popart::DebugContext(op_identify_id); return popart::DebugContext(op_identify_id);
} }
void Compiler::PushNameScope(const OpDesc* op) {
auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope));
if (op_namescope == "/") {
return;
}
if (!op_namescope.empty()) {
op_namescope.pop_back();
}
if (!op_namescope.empty()) {
op_namescope.erase(op_namescope.begin());
}
VLOG(10) << "name_scope is: " << op_namescope;
builder_->pushNameScope(op_namescope);
}
void Compiler::PopNameScope(const OpDesc* op) {
auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope));
if (op_namescope == "/") {
return;
}
builder_->popNameScope();
}
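A minimal standalone sketch of the name-scope trimming above, with a toy builder standing in for popart::Builder (only pushNameScope/popNameScope are modeled): the root scope "/" is skipped, and the surrounding slashes of a scope such as "/block_0/fc_0/" are stripped before pushing.

#include <iostream>
#include <string>
#include <vector>

struct ToyBuilder {  // stand-in for popart::Builder
  std::vector<std::string> scopes;
  void pushNameScope(const std::string& s) { scopes.push_back(s); }
  void popNameScope() { scopes.pop_back(); }
};

// Mirrors Compiler::PushNameScope: ignore the root scope "/", otherwise drop
// the trailing and leading '/' of op_namescope before pushing.
void PushTrimmedScope(ToyBuilder* b, std::string op_namescope) {
  if (op_namescope == "/") return;
  if (!op_namescope.empty()) op_namescope.pop_back();                   // drop trailing '/'
  if (!op_namescope.empty()) op_namescope.erase(op_namescope.begin());  // drop leading '/'
  b->pushNameScope(op_namescope);
}

int main() {
  ToyBuilder b;
  PushTrimmedScope(&b, "/block_0/fc_0/");
  std::cout << b.scopes.back() << "\n";  // prints: block_0/fc_0
  b.popNameScope();
  return 0;
}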
} // namespace ipu } // namespace ipu
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -50,6 +50,8 @@ struct CompilerResources { ...@@ -50,6 +50,8 @@ struct CompilerResources {
using OptimizerFn = using OptimizerFn =
std::function<std::unique_ptr<popart::Optimizer>(float lr)>; std::function<std::unique_ptr<popart::Optimizer>(float lr)>;
OptimizerFn optimizer_fn; OptimizerFn optimizer_fn;
// The optimizer used in eval mode during training
std::unique_ptr<popart::Optimizer> eval_optimizer;
public: public:
popart::Optimizer *Optimizer() { return optimizer.get(); } popart::Optimizer *Optimizer() { return optimizer.get(); }
...@@ -110,6 +112,7 @@ class Compiler { ...@@ -110,6 +112,7 @@ class Compiler {
void RegisterOpFunc(); void RegisterOpFunc();
std::vector<std::string> GetOpInputs(const OpDesc *op); std::vector<std::string> GetOpInputs(const OpDesc *op);
const std::vector<std::string> &GetOpOutputs(const OpDesc *op); const std::vector<std::string> &GetOpOutputs(const OpDesc *op);
const std::string GetNameScope(const OpDesc *op);
popart::DebugContext BuildDebugContext(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op);
void InsertTensors(const std::vector<std::string> &output_names, void InsertTensors(const std::vector<std::string> &output_names,
...@@ -126,6 +129,8 @@ class Compiler { ...@@ -126,6 +129,8 @@ class Compiler {
const OpDesc *op_desc); const OpDesc *op_desc);
void SetSerializeAttributes(const std::string &tensor_id, void SetSerializeAttributes(const std::string &tensor_id,
const OpDesc *op_desc); const OpDesc *op_desc);
void PushNameScope(const OpDesc *op);
void PopNameScope(const OpDesc *op);
private: private:
std::unique_ptr<popart::Builder> builder_; std::unique_ptr<popart::Builder> builder_;
...@@ -137,6 +142,14 @@ class Compiler { ...@@ -137,6 +142,14 @@ class Compiler {
const IpuStrategy *ipu_strategy_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr;
std::map<std::string, IpuCustomOpIdentifier> custom_ops_; std::map<std::string, IpuCustomOpIdentifier> custom_ops_;
// Used to choose the way to set amp for Ops.
// If any op has the attr sAvailMemAttribute, the
// available_memory_proportion from ipu_strategy
// will be ignored and each op is set by its own sAvailMemAttribute.
// Otherwise, all relevant Ops will be set by
// the available_memory_proportion from ipu_strategy.
bool set_amp_for_all_ = true;
}; };
} // namespace ipu } // namespace ipu
......
...@@ -64,15 +64,10 @@ void Executor::Prepare(const std::string &proto) { ...@@ -64,15 +64,10 @@ void Executor::Prepare(const std::string &proto) {
WeightsFromPaddle(); WeightsFromPaddle();
VLOG(10) << "Copy weights from paddle to popart...done"; VLOG(10) << "Copy weights from paddle to popart...done";
VLOG(10) << "Copy weights from host to device..."; if (ipu_strategy_->random_seed != std::numeric_limits<std::uint64_t>::max()) {
session_->weightsFromHost(); VLOG(10) << "Setting random seed to: " << ipu_strategy_->random_seed;
VLOG(10) << "Copy weights from host to device...done"; session_->setRandomSeed(ipu_strategy_->random_seed);
if (ipu_strategy_->save_init_onnx) {
session_->modelToHost("test_init.onnx");
} }
// init run step
step_ = 0;
} }
void Executor::Run(const std::vector<const Tensor *> &inputs, void Executor::Run(const std::vector<const Tensor *> &inputs,
...@@ -120,11 +115,17 @@ void Executor::Run(const std::vector<const Tensor *> &inputs, ...@@ -120,11 +115,17 @@ void Executor::Run(const std::vector<const Tensor *> &inputs,
VLOG(10) << "Prepared inputs/anchors"; VLOG(10) << "Prepared inputs/anchors";
if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) { if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) {
VLOG(10) << "Update learning_rate"; popart::Optimizer *optimizer;
auto new_lr = if (ipu_strategy_->runtime_options.enable_eval) {
GetSingleVarFromScope<float>(scope_, compiler_resources_->lr_var); VLOG(10) << "Switch optimizer to eval mode";
VLOG(10) << "New Lr: " << new_lr; optimizer = compiler_resources_->eval_optimizer.get();
auto *optimizer = compiler_resources_->UpdateOptimizer(new_lr); } else {
VLOG(10) << "Update learning_rate";
auto new_lr =
GetSingleVarFromScope<float>(scope_, compiler_resources_->lr_var);
VLOG(10) << "New Lr: " << new_lr;
optimizer = compiler_resources_->UpdateOptimizer(new_lr);
}
auto *session = dynamic_cast<popart::TrainingSession *>(session_.get()); auto *session = dynamic_cast<popart::TrainingSession *>(session_.get());
session->updateOptimizerFromHost(optimizer); session->updateOptimizerFromHost(optimizer);
} }
...@@ -133,15 +134,13 @@ void Executor::Run(const std::vector<const Tensor *> &inputs, ...@@ -133,15 +134,13 @@ void Executor::Run(const std::vector<const Tensor *> &inputs,
VLOG(10) << "Running..."; VLOG(10) << "Running...";
session_->run(stepio); session_->run(stepio);
VLOG(10) << "Running...done"; VLOG(10) << "Running...done";
}
step_++; void Executor::WeightsToHost() {
if (ipu_strategy_->is_training && if (ipu_strategy_->is_training && session_) {
step_ % ipu_strategy_->save_per_n_step == 0) {
session_->weightsToHost();
WeightsToPaddle(); WeightsToPaddle();
if (ipu_strategy_->save_onnx_checkpoint) { } else {
session_->modelToHost("test_last" + std::to_string(step_) + ".onnx"); LOG(WARNING) << "For a non-trainning graph, cannot sync weights from IPU.";
}
} }
} }
...@@ -153,6 +152,7 @@ void Executor::AcquireDevice() { ...@@ -153,6 +152,7 @@ void Executor::AcquireDevice() {
} }
bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
bool enable_distribution = ipu_strategy_->enable_distribution;
if (use_ipu_model) { if (use_ipu_model) {
std::map<std::string, std::string> deviceOpts{ std::map<std::string, std::string> deviceOpts{
{ {
...@@ -162,6 +162,16 @@ void Executor::AcquireDevice() { ...@@ -162,6 +162,16 @@ void Executor::AcquireDevice() {
}; };
device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice(
deviceOpts); deviceOpts);
} else if (enable_distribution) {
auto ipus_per_replica = ipu_strategy_->num_ipus /
ipu_strategy_->popart_options.replicatedGraphCount;
auto device_id = popdist_get_device(ipus_per_replica);
device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById(
device_id);
PADDLE_ENFORCE_NOT_NULL(
device_, platform::errors::Unavailable(
"Can't attach IPU in distribution, ipu_num = %d.",
RequestIpus(ipu_strategy_->num_ipus)));
} else { } else {
device_ = device_ =
popart::DeviceManager::createDeviceManager().acquireAvailableDevice( popart::DeviceManager::createDeviceManager().acquireAvailableDevice(
...@@ -185,28 +195,29 @@ void Executor::SetWeightsIO() { ...@@ -185,28 +195,29 @@ void Executor::SetWeightsIO() {
auto opt_type = compiler_resources_->optimizer_type; auto opt_type = compiler_resources_->optimizer_type;
VLOG(10) << "SetWeightsIO for " << opt_type; VLOG(10) << "SetWeightsIO for " << opt_type;
auto pre_post_fix = GetOptPrePostfix(opt_type); auto pre_post_fix = GetOptPrePostfix(opt_type);
for (const auto &weight_id : compiler_resources_->weights) { for (const auto &weight_pd : compiler_resources_->weights) {
for (const auto &pair : pre_post_fix) { for (const auto &pair : pre_post_fix) {
// pair.first : popart prefix, pair.second : paddle postfix // pair.first : popart prefix, pair.second : paddle postfix
auto popart_var_name = pair.first + weight_id; auto weight_pop = compiler_resources_->tensors[weight_pd];
auto paddle_var_name = weight_id + pair.second; auto popart_var = pair.first + weight_pop;
auto paddle_var = weight_pd + pair.second;
if (scope_->FindVar(paddle_var_name) == nullptr) { if (scope_->FindVar(paddle_var) == nullptr) {
continue; continue;
} }
if (!session_->hasInfo(popart_var)) {
if (!session_->hasInfo(popart_var_name)) {
continue; continue;
} }
auto var = scope_->GetVar(paddle_var_name); VLOG(10) << "Connect paddle weight: " << paddle_var
<< " with popart weight: " << popart_var;
auto var = scope_->GetVar(paddle_var);
auto data_ptr = var->GetMutable<framework::LoDTensor>()->data(); auto data_ptr = var->GetMutable<framework::LoDTensor>()->data();
auto tensor_info = session_->getInfo(popart_var);
auto tensor_info = session_->getInfo(popart_var_name); executor_resources_->weights_io.insert(popart_var,
executor_resources_->weights_io.insert(popart_var_name,
{data_ptr, tensor_info}); {data_ptr, tensor_info});
executor_resources_->weights_and_opt_state.emplace_back( executor_resources_->weights_and_opt_state.emplace_back(
std::make_pair(popart_var_name, paddle_var_name)); std::make_pair(popart_var, paddle_var));
} }
} }
} }
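A minimal standalone sketch of the name pairing done in SetWeightsIO above. The prefix/postfix values below are made up for illustration (the real pairs come from GetOptPrePostfix and depend on the optimizer type), and plain std containers stand in for CompilerResources.

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // paddle weight name -> popart tensor id (resources_->tensors)
  std::map<std::string, std::string> tensors = {{"fc_0.w_0", "fc_0.w_0"}};
  // compiler_resources_->weights now stores paddle names
  std::vector<std::string> weights = {"fc_0.w_0"};
  // (popart prefix, paddle postfix) pairs; {"", ""} connects the weight
  // itself, the second entry connects optimizer state (values illustrative).
  std::vector<std::pair<std::string, std::string>> pre_post_fix = {
      {"", ""}, {"Accl1___", "_moment1_0"}};

  for (const auto& weight_pd : weights) {
    for (const auto& pair : pre_post_fix) {
      auto weight_pop = tensors[weight_pd];
      auto popart_var = pair.first + weight_pop;  // name on the popart side
      auto paddle_var = weight_pd + pair.second;  // name in the paddle scope
      std::cout << popart_var << " <-> " << paddle_var << "\n";
    }
  }
  return 0;
}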
...@@ -284,6 +295,7 @@ void Executor::ConvertWeights(bool align_to_popart) { ...@@ -284,6 +295,7 @@ void Executor::ConvertWeights(bool align_to_popart) {
void Executor::WeightsFromPaddle() { void Executor::WeightsFromPaddle() {
ConvertWeights(true); ConvertWeights(true);
session_->writeWeights(executor_resources_->weights_io); session_->writeWeights(executor_resources_->weights_io);
session_->weightsFromHost();
} }
// |-----------------------------------------------------| // |-----------------------------------------------------|
...@@ -297,13 +309,13 @@ void Executor::WeightsFromPaddle() { ...@@ -297,13 +309,13 @@ void Executor::WeightsFromPaddle() {
// Paddle -> halfToFloat: cast then save to paddle // Paddle -> halfToFloat: cast then save to paddle
// Popart -> Paddle: copy from paddle to popart // Popart -> Paddle: copy from paddle to popart
void Executor::WeightsToPaddle() { void Executor::WeightsToPaddle() {
session_->weightsToHost();
session_->readWeights(executor_resources_->weights_io); session_->readWeights(executor_resources_->weights_io);
ConvertWeights(false); ConvertWeights(false);
} }
void Executor::SaveModelToHost(const std::string &path) { void Executor::SaveModelToHost(const std::string &path) {
if (session_) { if (session_) {
session_->weightsToHost();
WeightsToPaddle(); WeightsToPaddle();
session_->modelToHost(path); session_->modelToHost(path);
} else { } else {
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include <popart/patterns/patterns.hpp> #include <popart/patterns/patterns.hpp>
#include <popart/session.hpp> #include <popart/session.hpp>
#include <popart/tensorinfo.hpp> #include <popart/tensorinfo.hpp>
#include <popdist/popdist_poplar.hpp>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -36,8 +37,7 @@ struct ExecutorResources { ...@@ -36,8 +37,7 @@ struct ExecutorResources {
// map<tensor_id, paddle_var_ptr> // map<tensor_id, paddle_var_ptr>
popart::WeightsIO weights_io; popart::WeightsIO weights_io;
// <popart_var, paddle_var> pairs, include weights and optimizer states // <popart_var, paddle_var> pairs, include weights and optimizer states
std::vector<std::pair<popart::TensorId, popart::TensorId>> std::vector<std::pair<popart::TensorId, std::string>> weights_and_opt_state;
weights_and_opt_state;
}; };
class Executor { class Executor {
...@@ -53,14 +53,12 @@ class Executor { ...@@ -53,14 +53,12 @@ class Executor {
const std::vector<Tensor *> &outputs, const std::vector<Tensor *> &outputs,
const framework::ExecutionContext &ctx); const framework::ExecutionContext &ctx);
// sync weights from popart to paddle
void WeightsToHost();
// detach IPU // detach IPU
void Detach(); void Detach();
void SetWeightsIO();
void ConvertWeights(bool align_to_popart);
void WeightsFromPaddle();
void WeightsToPaddle();
// Scope // Scope
void SetScope(const Scope *scope) { scope_ = scope; } void SetScope(const Scope *scope) { scope_ = scope; }
...@@ -79,6 +77,10 @@ class Executor { ...@@ -79,6 +77,10 @@ class Executor {
private: private:
void AcquireDevice(); void AcquireDevice();
void SetWeightsIO();
void ConvertWeights(bool);
void WeightsFromPaddle();
void WeightsToPaddle();
private: private:
// not own // not own
...@@ -92,8 +94,6 @@ class Executor { ...@@ -92,8 +94,6 @@ class Executor {
std::unique_ptr<popart::Session> session_; std::unique_ptr<popart::Session> session_;
// one session corresponds to one graph // one session corresponds to one graph
std::unique_ptr<ExecutorResources> executor_resources_; std::unique_ptr<ExecutorResources> executor_resources_;
int step_ = 0;
}; };
} // namespace ipu } // namespace ipu
......
...@@ -24,6 +24,8 @@ static constexpr const char *sIpuIndexAttr = "ipu_index"; ...@@ -24,6 +24,8 @@ static constexpr const char *sIpuIndexAttr = "ipu_index";
static constexpr const char *sIpuStageAttr = "ipu_stage"; static constexpr const char *sIpuStageAttr = "ipu_stage";
static constexpr const char *sMatmulSerializeFactor = "serialize_factor"; static constexpr const char *sMatmulSerializeFactor = "serialize_factor";
static constexpr const char *sMatmulSerializeMode = "serialize_mode"; static constexpr const char *sMatmulSerializeMode = "serialize_mode";
static constexpr const char *sAvailMemAttribute = "__available_memory";
static constexpr const char *sOpNamescope = "op_namescope";
static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; static constexpr const char *sOpIdentifyIdAttr = "op_identify_id";
static constexpr const char *sDebugInfoId = "__debug_info_id"; static constexpr const char *sDebugInfoId = "__debug_info_id";
......
...@@ -62,23 +62,40 @@ IpuStrategy::IpuStrategy() { ...@@ -62,23 +62,40 @@ IpuStrategy::IpuStrategy() {
[&]() { return name; }) [&]() { return name; })
ADD_BOOL_OPTION(is_training); ADD_BOOL_OPTION(is_training);
ADD_BOOL_OPTION(save_init_onnx);
ADD_BOOL_OPTION(save_onnx_checkpoint);
ADD_BOOL_OPTION(need_avg_shard); ADD_BOOL_OPTION(need_avg_shard);
ADD_BOOL_OPTION(enable_fp16); ADD_BOOL_OPTION(enable_fp16);
ADD_BOOL_OPTION(transfer_cast_op);
ADD_BOOL_OPTION(use_no_bias_optimizer);
ADD_BOOL_OPTION(enable_distribution);
ADD_UINT64_OPTION(num_ipus); ADD_UINT64_OPTION(num_ipus);
ADD_UINT64_OPTION(batches_per_step); ADD_UINT64_OPTION(batches_per_step);
ADD_UINT64_OPTION(micro_batch_size); ADD_UINT64_OPTION(micro_batch_size);
ADD_UINT64_OPTION(save_per_n_step); ADD_UINT64_OPTION(random_seed);
ADD_DOUBLE_OPTION(available_memory_proportion); ADD_DOUBLE_OPTION(available_memory_proportion);
ADD_DOUBLE_OPTION(loss_scaling); ADD_DOUBLE_OPTION(loss_scaling);
ADD_DOUBLE_OPTION(max_weight_norm); ADD_DOUBLE_OPTION(max_weight_norm);
ADD_STRING_OPTION(accl1_type);
ADD_STRING_OPTION(accl2_type);
ADD_STRING_OPTION(accl3_type);
ADD_STRING_OPTION(onnx_dump_path);
ADD_STRING_OPTION(weight_decay_mode);
#undef ADD_STRING_OPTION #undef ADD_STRING_OPTION
#undef ADD_DOUBLE_OPTION #undef ADD_DOUBLE_OPTION
#undef ADD_UINT64_OPTION #undef ADD_UINT64_OPTION
#undef ADD_BOOL_OPTION #undef ADD_BOOL_OPTION
#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name) \
RegisterSetter(bool_options, #name, \
[&](bool value) { runtime_options.aliased_name = value; }); \
RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \
return std::to_string(runtime_options.aliased_name); \
})
ADD_RUNTIME_BOOL_OPTION(runtime_options.enable_eval, enable_eval);
#undef ADD_RUNTIME_BOOL_OPTION
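A minimal standalone sketch of what ADD_RUNTIME_BOOL_OPTION expands to conceptually: string-keyed setters and getters stored in maps of std::function, so a runtime flag can be flipped by its option name. The struct and map names are simplified stand-ins, not the IpuStrategy members.

#include <functional>
#include <iostream>
#include <map>
#include <string>

struct ToyRuntimeOptions { bool enable_eval = false; };

int main() {
  ToyRuntimeOptions runtime_options;
  std::map<std::string, std::function<void(bool)>> bool_setters;
  std::map<std::string, std::function<std::string()>> getters;

  // Conceptual expansion of ADD_RUNTIME_BOOL_OPTION(runtime_options.enable_eval, enable_eval)
  bool_setters["runtime_options.enable_eval"] =
      [&](bool v) { runtime_options.enable_eval = v; };
  getters["runtime_options.enable_eval"] =
      [&]() { return std::to_string(runtime_options.enable_eval); };

  bool_setters["runtime_options.enable_eval"](true);              // set by name
  std::cout << getters["runtime_options.enable_eval"]() << "\n";  // prints "1"
  return 0;
}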
#define ADD_POPART_ENUM_OPTION_ALIAS(name, aliased_name, EnumType) \ #define ADD_POPART_ENUM_OPTION_ALIAS(name, aliased_name, EnumType) \
RegisterSetter(uint64_options, #name, [&](std::uint64_t value) { \ RegisterSetter(uint64_options, #name, [&](std::uint64_t value) { \
PADDLE_ENFORCE_LT( \ PADDLE_ENFORCE_LT( \
...@@ -171,6 +188,7 @@ IpuStrategy::IpuStrategy() { ...@@ -171,6 +188,7 @@ IpuStrategy::IpuStrategy() {
ADD_POPART_UINT64_OPTION_ALIAS(merge_var_update_mem_threshold, ADD_POPART_UINT64_OPTION_ALIAS(merge_var_update_mem_threshold,
mergeVarUpdateMemThreshold); mergeVarUpdateMemThreshold);
ADD_POPART_UINT64_OPTION_ALIAS(loose_threshold_at_peak, looseThresholdAtPeak); ADD_POPART_UINT64_OPTION_ALIAS(loose_threshold_at_peak, looseThresholdAtPeak);
ADD_POPART_UINT64_OPTION_ALIAS(replicated_graph_count, replicatedGraphCount);
ADD_POPART_UINT64_OPTION_ALIAS(accumulation_factor, accumulationFactor); ADD_POPART_UINT64_OPTION_ALIAS(accumulation_factor, accumulationFactor);
ADD_POPART_UINT64_OPTION_ALIAS(swap_limit_scheduler, swapLimitScheduler); ADD_POPART_UINT64_OPTION_ALIAS(swap_limit_scheduler, swapLimitScheduler);
ADD_POPART_UINT64_OPTION_ALIAS(global_replication_factor, ADD_POPART_UINT64_OPTION_ALIAS(global_replication_factor,
...@@ -462,12 +480,30 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, ...@@ -462,12 +480,30 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor,
} else if (opt == "use_io_tiles_to_store") { } else if (opt == "use_io_tiles_to_store") {
settings->location.storageTileSet = settings->location.storageTileSet =
value > 0 ? popart::TileSet::IO : popart::TileSet::Compute; value > 0 ? popart::TileSet::IO : popart::TileSet::Compute;
} else if (opt == "sharding_domain_with_all") {
settings->location.shardingDomain =
popart::CommGroup(popart::CommGroupType::All, value);
} else if (opt == "sharding_domain_with_consecutive") {
settings->location.shardingDomain =
popart::CommGroup(popart::CommGroupType::Consecutive, value);
} else if (opt == "sharding_domain_with_orthogonal") {
settings->location.shardingDomain =
popart::CommGroup(popart::CommGroupType::Orthogonal, value);
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"Unknown option ' %s' for tensor location: %s", opt, tensor)); "Unknown option ' %s' for tensor location: %s", opt, tensor));
} }
} }
void IpuStrategy::SetAccumulateOuterFragmentSettings(
const std::uint64_t& schedule, const std::vector<int>& values) {
VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule;
auto schedule_ =
static_cast<popart::AccumulateOuterFragmentSchedule>(schedule);
popart_options.accumulateOuterFragmentSettings =
popart::AccumulateOuterFragmentSettings(schedule_, values);
}
void IpuStrategy::AddCustomOp(const std::string& paddle_op, void IpuStrategy::AddCustomOp(const std::string& paddle_op,
const std::string& popart_op, const std::string& popart_op,
const std::string& domain, int version) { const std::string& domain, int version) {
......
...@@ -24,6 +24,11 @@ namespace paddle { ...@@ -24,6 +24,11 @@ namespace paddle {
namespace platform { namespace platform {
namespace ipu { namespace ipu {
struct RuntimeOptions {
// enable the eval mode in training by switching optimizers.
bool enable_eval = false;
};
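A minimal standalone sketch, with toy types, of how enable_eval is consumed at run time (see the Executor::Run change earlier in this diff): when it is on, a pre-built zero-learning-rate eval optimizer is handed to the session instead of the training optimizer rebuilt with the current learning rate. ToyOptimizer and PickOptimizer are illustrative names only.

#include <iostream>
#include <memory>

struct ToyOptimizer {
  float lr;
  explicit ToyOptimizer(float lr) : lr(lr) {}
};

// Mirrors the branch in Executor::Run: eval mode returns the frozen optimizer,
// training mode rebuilds the optimizer with the learning rate read from scope.
ToyOptimizer* PickOptimizer(bool enable_eval, float current_lr,
                            std::unique_ptr<ToyOptimizer>& train_opt,
                            const std::unique_ptr<ToyOptimizer>& eval_opt) {
  if (enable_eval) return eval_opt.get();                    // lr == 0, weights frozen
  train_opt = std::make_unique<ToyOptimizer>(current_lr);    // UpdateOptimizer(new_lr)
  return train_opt.get();
}

int main() {
  auto train_opt = std::make_unique<ToyOptimizer>(0.01f);
  auto eval_opt = std::make_unique<ToyOptimizer>(0.0f);
  std::cout << PickOptimizer(false, 0.02f, train_opt, eval_opt)->lr << "\n";  // 0.02
  std::cout << PickOptimizer(true, 0.02f, train_opt, eval_opt)->lr << "\n";   // 0
  return 0;
}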
class IpuStrategy { class IpuStrategy {
public: public:
IpuStrategy(); IpuStrategy();
...@@ -32,19 +37,24 @@ class IpuStrategy { ...@@ -32,19 +37,24 @@ class IpuStrategy {
// training flag, true for training // training flag, true for training
bool is_training = true; bool is_training = true;
// save the onnx model lowered by paddle program description
bool save_init_onnx = false;
// save the trained model
bool save_onnx_checkpoint = false;
// average sharding, debugging used // average sharding, debugging used
bool need_avg_shard = false; bool need_avg_shard = false;
// flag for fp16, true for pure fp16 // flag for fp16, true for pure fp16
bool enable_fp16 = false; bool enable_fp16 = false;
// Number ipus total needed, replica * ipu_per_replica // enable transfer cast Op target from fp32 to fp16 in fp16 mode
bool transfer_cast_op = true;
// The mode of Adam/Lamb optimizer
// false: The standard Adam/Lamb optimizer
// true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART
bool use_no_bias_optimizer = false;
// enable distributed computing for POD128 or POD256
bool enable_distribution = false;
// Number ipus total needed, local_replica * ipu_per_replica
int num_ipus = 1; int num_ipus = 1;
// batches per step // batches per step
...@@ -53,8 +63,8 @@ class IpuStrategy { ...@@ -53,8 +63,8 @@ class IpuStrategy {
// micro batch-size // micro batch-size
int micro_batch_size = 1; int micro_batch_size = 1;
// save paddle model per n steps // random seed
int save_per_n_step = 1; std::uint64_t random_seed = std::numeric_limits<std::uint64_t>::max();
// TODO(alleng) remove this param // TODO(alleng) remove this param
// available memory proportion, 0.0f for disable // available memory proportion, 0.0f for disable
...@@ -67,6 +77,29 @@ class IpuStrategy { ...@@ -67,6 +77,29 @@ class IpuStrategy {
// defaultMaxWeightNorm for adam optimizer // defaultMaxWeightNorm for adam optimizer
float max_weight_norm = 65504.0f; float max_weight_norm = 65504.0f;
// file path for dumping compiled model in onnx format
std::string onnx_dump_path;
// Data type to use for tensor that stores first-order momentum optimizer
// state. FLOAT or FLOAT16
std::string accl1_type = "FLOAT";
// Data type to use for tensor that stores second-order momentum optimizer
// state. FLOAT or FLOAT16
std::string accl2_type = "FLOAT";
// Data type to use for tensor that stores third-order momentum optimizer
// state. FLOAT or FLOAT16
std::string accl3_type = "FLOAT";
// WeightDecayMode for setting the optimizer
// if set, it will override other settings
// value must be one of "decay" or "l2_regularization" or not set
std::string weight_decay_mode = "";
// Runtime Options
RuntimeOptions runtime_options;
// popart session option // popart session option
popart::SessionOptions popart_options; popart::SessionOptions popart_options;
...@@ -86,6 +119,8 @@ class IpuStrategy { ...@@ -86,6 +119,8 @@ class IpuStrategy {
const std::string &value); const std::string &value);
void SetTensorLocation(const std::string &tensor, const std::string &option, void SetTensorLocation(const std::string &tensor, const std::string &option,
std::uint64_t value); std::uint64_t value);
void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule,
const std::vector<int> &values);
void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, void AddCustomOp(const std::string &paddle_op, const std::string &popart_op,
const std::string &domain, int version); const std::string &domain, int version);
......
...@@ -34,15 +34,36 @@ Node *logical_not_handler(Graph *graph, Node *node) { ...@@ -34,15 +34,36 @@ Node *logical_not_handler(Graph *graph, Node *node) {
{GetOutputVarNode("Out", node)}, {}); {GetOutputVarNode("Out", node)}, {});
} }
Node *logical_or_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_logical_or",
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
{GetOutputVarNode("Out", node)}, {});
}
Node *logical_and_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_logical_and",
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
{GetOutputVarNode("Out", node)}, {});
}
Node *greater_than_handler(Graph *graph, Node *node) { Node *greater_than_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_greater", return CreateBaseOp(graph, node, "popart_greater",
{GetInputVarNode("X", node), GetInputVarNode("Y", node)}, {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
{GetOutputVarNode("Out", node)}, {}); {GetOutputVarNode("Out", node)}, {});
} }
Node *less_than_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_less",
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
{GetOutputVarNode("Out", node)}, {});
}
REGISTER_HANDLER(equal, equal_handler); REGISTER_HANDLER(equal, equal_handler);
REGISTER_HANDLER(logical_not, logical_not_handler); REGISTER_HANDLER(logical_not, logical_not_handler);
REGISTER_HANDLER(logical_or, logical_or_handler);
REGISTER_HANDLER(logical_and, logical_and_handler);
REGISTER_HANDLER(greater_than, greater_than_handler); REGISTER_HANDLER(greater_than, greater_than_handler);
REGISTER_HANDLER(less_than, less_than_handler);
} // namespace } // namespace
} // namespace ipu } // namespace ipu
......
...@@ -98,6 +98,12 @@ Node *matmul_handler(Graph *graph, Node *node) { ...@@ -98,6 +98,12 @@ Node *matmul_handler(Graph *graph, Node *node) {
if (x_rank == 1) { if (x_rank == 1) {
perm = std::vector<int64_t>{0}; perm = std::vector<int64_t>{0};
} else if (x_rank == 2) { } else if (x_rank == 2) {
if (!transpose_x && !transpose_y && is_float_equal(alpha, 1.0f)) {
return CreateBaseOp(
graph, node, "popart_matmul",
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
node->outputs);
}
return CreateGemm(graph, node, return CreateGemm(graph, node,
{GetInputVarNode("X", node), GetInputVarNode("Y", node)}, {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
node->outputs, transpose_x, transpose_y, alpha); node->outputs, transpose_x, transpose_y, alpha);
......
...@@ -32,30 +32,10 @@ const std::string GenerateOpName() { ...@@ -32,30 +32,10 @@ const std::string GenerateOpName() {
const std::string CreateOpIdentifyId(Node *node) { const std::string CreateOpIdentifyId(Node *node) {
// format: // format:
// if has custom op_namescope: // op_type/_gen_*
// {op_namescope}/op_type/_gen_*
// else:
// {op_type}/{out_var0}/{out_var1}/.../_gen_*
// this name will be used as op name when exporting onnx model from popart // this name will be used as op name when exporting onnx model from popart
auto op_type = node->Name(); auto op_type = node->Name();
std::string op_namescope; return {op_type + "/" + GenerateOpName()};
if (node->Op()->HasAttr("op_namescope")) {
op_namescope =
BOOST_GET_CONST(std::string, node->Op()->GetAttr("op_namescope"));
} else {
op_namescope = "/";
}
if (op_namescope != "/") {
return {op_namescope + op_type + "/" + GenerateOpName()};
} else {
std::string op_out = "";
for (auto *out_node : node->outputs) {
op_out += "/";
op_out += out_node->Name();
}
return {op_type + op_out + "/" + GenerateOpName()};
}
} }
Node *MakeVarNode(Graph *graph, Node *node) { Node *MakeVarNode(Graph *graph, Node *node) {
...@@ -122,6 +102,12 @@ Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type, ...@@ -122,6 +102,12 @@ Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
if (node->Op()->HasAttr(sMatmulSerializeMode)) { if (node->Op()->HasAttr(sMatmulSerializeMode)) {
CopyOpAttr(sMatmulSerializeMode, node->Op(), new_node->Op()); CopyOpAttr(sMatmulSerializeMode, node->Op(), new_node->Op());
} }
if (node->Op()->HasAttr(sAvailMemAttribute)) {
CopyOpAttr(sAvailMemAttribute, node->Op(), new_node->Op());
}
if (node->Op()->HasAttr(sOpNamescope)) {
CopyOpAttr(sOpNamescope, node->Op(), new_node->Op());
}
{ {
new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node)); new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node));
new_node->Op()->Flush(); new_node->Op()->Flush();
......
...@@ -54,10 +54,36 @@ Node *checkpointoutput_handler(Graph *graph, Node *node) { ...@@ -54,10 +54,36 @@ Node *checkpointoutput_handler(Graph *graph, Node *node) {
node->outputs); node->outputs);
} }
Node *custom_nll_loss_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction"));
auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignoreIndex"));
auto inputIsLogProbability =
BOOST_GET_CONST(bool, op->GetAttr("inputIsLogProbability"));
return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs,
node->outputs,
{{"reduction", reduction},
{"ignoreIndex", ignoreIndex},
{"inputIsLogProbability", inputIsLogProbability}});
}
Node *identity_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_identity", node->inputs,
node->outputs);
}
Node *detach_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_detach_v2", node->inputs,
node->outputs);
}
REGISTER_HANDLER(custom_op, custom_op_handler); REGISTER_HANDLER(custom_op, custom_op_handler);
REGISTER_HANDLER(print, print_handler); REGISTER_HANDLER(print, print_handler);
REGISTER_HANDLER(popart_optimizer, popart_optimizer_handler); REGISTER_HANDLER(popart_optimizer, popart_optimizer_handler);
REGISTER_HANDLER(checkpointoutput, checkpointoutput_handler); REGISTER_HANDLER(checkpointoutput, checkpointoutput_handler);
REGISTER_HANDLER(custom_nll_loss, custom_nll_loss_handler);
REGISTER_HANDLER(identity, identity_handler);
REGISTER_HANDLER(detach, detach_handler);
} // namespace } // namespace
} // namespace ipu } // namespace ipu
......
...@@ -49,6 +49,9 @@ Node *fill_constant_handler(Graph *graph, Node *node) { ...@@ -49,6 +49,9 @@ Node *fill_constant_handler(Graph *graph, Node *node) {
case framework::proto::VarType::INT64: case framework::proto::VarType::INT64:
value = std::vector<int64_t>(size, value_); value = std::vector<int64_t>(size, value_);
break; break;
case framework::proto::VarType::BOOL:
value = std::vector<bool>(size, value_);
break;
default: default:
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("fill_constant dtype: %d", dtype_)); platform::errors::Unimplemented("fill_constant dtype: %d", dtype_));
...@@ -417,6 +420,45 @@ Node *assign_handler(Graph *graph, Node *node) { ...@@ -417,6 +420,45 @@ Node *assign_handler(Graph *graph, Node *node) {
{GetOutputVarNode("Out", node)}, {}); {GetOutputVarNode("Out", node)}, {});
} }
Node *assign_value_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype"));
auto dtype = VarType2OnnxDtype(dtype_);
auto dims_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("shape"));
std::vector<int64_t> dims(dims_.begin(), dims_.end());
Attribute values;
std::string value_name;
switch (dtype_) {
case framework::proto::VarType::BOOL: {
value_name = "bool_values";
auto vec_int = BOOST_GET_CONST(std::vector<int>, op->GetAttr(value_name));
std::vector<bool> vec_bool(vec_int.begin(), vec_int.end());
values = vec_bool;
} break;
case framework::proto::VarType::INT32:
value_name = "int32_values";
values = BOOST_GET_CONST(std::vector<int>, op->GetAttr(value_name));
break;
case framework::proto::VarType::FP32:
value_name = "fp32_values";
values = BOOST_GET_CONST(std::vector<float>, op->GetAttr(value_name));
break;
case framework::proto::VarType::INT64:
value_name = "int64_values";
values = BOOST_GET_CONST(std::vector<int64_t>, op->GetAttr(value_name));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported data type(code %d) for AssignValue operator, only "
"supports bool, int32, float32 and int64.",
dtype));
}
return CreateConst(graph, node, node->inputs, node->outputs,
AttributeMap{
{"value", values}, {"dims", dims}, {"dtype", dtype},
});
}
Node *fill_any_like_handler(Graph *graph, Node *node) { Node *fill_any_like_handler(Graph *graph, Node *node) {
auto *op = node->Op(); auto *op = node->Op();
auto value = BOOST_GET_CONST(float, op->GetAttr("value")); auto value = BOOST_GET_CONST(float, op->GetAttr("value"));
...@@ -482,6 +524,41 @@ Node *one_hot_handler(Graph *graph, Node *node) { ...@@ -482,6 +524,41 @@ Node *one_hot_handler(Graph *graph, Node *node) {
} }
} }
Node *one_hot_v2_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto depth = BOOST_GET_CONST(int, op->GetAttr("depth"));
auto allow_out_of_range =
BOOST_GET_CONST(bool, op->GetAttr("allow_out_of_range"));
if (allow_out_of_range) {
PADDLE_THROW(platform::errors::Unimplemented(
"Do not support allow_out_of_range=True"));
} else {
auto depth_tensor =
CreateConst(graph, node, {}, {}, {{"value", std::vector<int>{depth}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::INT32}});
Node *value_tensor = nullptr;
if (GetOutputVarNode("Out", node)->Var()->GetDataType() ==
framework::proto::VarType::FP16) {
value_tensor =
CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0, 1}},
{"dims", std::vector<int64_t>{2}},
{"dtype", ONNXDataType::FLOAT16}});
} else {
value_tensor =
CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0, 1}},
{"dims", std::vector<int64_t>{2}},
{"dtype", ONNXDataType::FLOAT}});
}
return CreateBaseOp(graph, node, "popart_onehot",
{GetInputVarNode("X", node), depth_tensor->outputs[0],
value_tensor->outputs[0]},
{GetOutputVarNode("Out", node)},
{{"axis", int64_t{-1}}});
}
}
Node *split_handler(Graph *graph, Node *node) { Node *split_handler(Graph *graph, Node *node) {
auto *op = node->Op(); auto *op = node->Op();
auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
...@@ -510,10 +587,12 @@ REGISTER_HANDLER(shape, shape_handler); ...@@ -510,10 +587,12 @@ REGISTER_HANDLER(shape, shape_handler);
REGISTER_HANDLER(slice, slice_handler); REGISTER_HANDLER(slice, slice_handler);
REGISTER_HANDLER(expand, expand_handler); REGISTER_HANDLER(expand, expand_handler);
REGISTER_HANDLER(assign, assign_handler); REGISTER_HANDLER(assign, assign_handler);
REGISTER_HANDLER(assign_value, assign_value_handler);
REGISTER_HANDLER(fill_any_like, fill_any_like_handler); REGISTER_HANDLER(fill_any_like, fill_any_like_handler);
REGISTER_HANDLER(lookup_table_v2, lookup_table_v2_handler); REGISTER_HANDLER(lookup_table_v2, lookup_table_v2_handler);
REGISTER_HANDLER(split, split_handler); REGISTER_HANDLER(split, split_handler);
REGISTER_HANDLER(one_hot, one_hot_handler); REGISTER_HANDLER(one_hot, one_hot_handler);
REGISTER_HANDLER(one_hot_v2, one_hot_v2_handler);
} // namespace } // namespace
} // namespace ipu } // namespace ipu
......
...@@ -51,16 +51,20 @@ XPUOpMap& get_kl2_ops() { ...@@ -51,16 +51,20 @@ XPUOpMap& get_kl2_ops() {
{"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
{"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, pOpKernelType(vartype::FP16, XPUPlace())})},
{"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"conv2d_transpose_grad", {"conv2d_transpose_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"conv2d_transpose", {"conv2d_transpose",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"depthwise_conv2d_grad", {"depthwise_conv2d_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"depthwise_conv2d", {"depthwise_conv2d",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"dropout_grad", {"dropout_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
......
...@@ -916,6 +916,11 @@ class DeviceContextPool { ...@@ -916,6 +916,11 @@ class DeviceContextPool {
size_t size() const { return device_contexts_.size(); } size_t size() const { return device_contexts_.size(); }
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
device_contexts() const {
return device_contexts_;
}
private: private:
static DeviceContextPool* pool; static DeviceContextPool* pool;
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>> std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
......
...@@ -4264,6 +4264,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4264,6 +4264,7 @@ All parameter, weight, gradient are variables in Paddle.
platform::ipu::IpuBackend::GetInstance()); platform::ipu::IpuBackend::GetInstance());
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("weights_to_host", &platform::ipu::IpuBackend::WeightsToHost)
.def("detach", &platform::ipu::IpuBackend::Detach) .def("detach", &platform::ipu::IpuBackend::Detach)
.def("reset", &platform::ipu::IpuBackend::Reset) .def("reset", &platform::ipu::IpuBackend::Reset)
.def("set_scope", &platform::ipu::IpuBackend::SetScope) .def("set_scope", &platform::ipu::IpuBackend::SetScope)
...@@ -4311,6 +4312,15 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4311,6 +4312,15 @@ All parameter, weight, gradient are variables in Paddle.
option_name, option.first.cast<std::string>(), option_name, option.first.cast<std::string>(),
option.second.cast<std::uint64_t>()); option.second.cast<std::uint64_t>());
} }
} else if (option_name == "accumulate_outer_fragment") {
for (auto option : element.second.cast<py::dict>()) {
std::vector<int> values;
for (auto value : option.second.cast<py::list>()) {
values.push_back(value.cast<int>());
}
self.SetAccumulateOuterFragmentSettings(
option.first.cast<std::uint64_t>(), values);
}
} else if (option_name == "custom_op") { } else if (option_name == "custom_op") {
std::string paddle_op; std::string paddle_op;
std::string popart_op; std::string popart_op;
......
...@@ -129,7 +129,7 @@ class PredictExecutor : public MlirToRuntimeTranslator { ...@@ -129,7 +129,7 @@ class PredictExecutor : public MlirToRuntimeTranslator {
auto arg = predict_func.getArgument(i); auto arg = predict_func.getArgument(i);
auto type = arg.getType(); auto type = arg.getType();
// this param is TensorMap // this param is TensorMap
if (type.isa<infrt::DenseTensorMapType>()) { if (type.isa<infrt::DenseHostTensorMapType>()) {
auto* value = new host_context::Value(std::move(*map)); auto* value = new host_context::Value(std::move(*map));
arguments_.push_back(value); arguments_.push_back(value);
AddValue(predict_func.getArgument(i), value); AddValue(predict_func.getArgument(i), value);
......
...@@ -106,7 +106,7 @@ def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> { ...@@ -106,7 +106,7 @@ def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> {
// input path of model params. // input path of model params.
let arguments = (ins StrAttr:$path); let arguments = (ins StrAttr:$path);
let results = (outs DenseTensorMap:$out); let results = (outs DenseHostTensorMap:$out);
let assemblyFormat = "`(``)`attr-dict"; let assemblyFormat = "`(``)`attr-dict";
} }
...@@ -121,7 +121,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { ...@@ -121,7 +121,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> {
// input path of model params. // input path of model params.
let arguments = (ins let arguments = (ins
DenseTensorMap:$map, DenseHostTensorMap:$map,
StrAttr:$name StrAttr:$name
); );
let results = (outs DenseTensor:$output); let results = (outs DenseTensor:$output);
...@@ -136,7 +136,7 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { ...@@ -136,7 +136,7 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> {
An operation that get the size of a TensorMap. An operation that get the size of a TensorMap.
}]; }];
let arguments = (ins DenseTensorMap:$map); let arguments = (ins DenseHostTensorMap:$map);
let results = (outs I32:$size); let results = (outs I32:$size);
let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)"; let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)";
} }
......
...@@ -83,7 +83,7 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { ...@@ -83,7 +83,7 @@ def DenseTensor : Infrt_Type<"DenseTensor"> {
); );
} }
def DenseTensorMap : Infrt_Type<"DenseTensorMap"> { def DenseHostTensorMap : Infrt_Type<"DenseHostTensorMap"> {
let summary = "infrt dense tensor map"; let summary = "infrt dense tensor map";
let description = [{dense_tensor map}]; let description = [{dense_tensor map}];
let parameters = (ins); let parameters = (ins);
......
...@@ -91,7 +91,7 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { ...@@ -91,7 +91,7 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
parser.getContext(), shape, elementType, lod_level); parser.getContext(), shape, elementType, lod_level);
} }
if (keyword == "dense_tensor_map") { if (keyword == "dense_tensor_map") {
return DenseTensorMapType::get(parser.getContext()); return DenseHostTensorMapType::get(parser.getContext());
} }
if (keyword == "dense_tensor") { if (keyword == "dense_tensor") {
// parse DenseTensor, for example: !i=Infrt.tensor<X86, CUDA, F32> // parse DenseTensor, for example: !i=Infrt.tensor<X86, CUDA, F32>
...@@ -162,7 +162,7 @@ void InfrtDialect::printType(::mlir::Type type, ...@@ -162,7 +162,7 @@ void InfrtDialect::printType(::mlir::Type type,
<< lod_tensor_type.getLod_level() << ">"; << lod_tensor_type.getLod_level() << ">";
return; return;
} }
if (type.isa<infrt::DenseTensorMapType>()) { if (type.isa<infrt::DenseHostTensorMapType>()) {
os << "dense_tensor_map"; os << "dense_tensor_map";
return; return;
} }
...@@ -180,12 +180,6 @@ void InfrtDialect::printType(::mlir::Type type, ...@@ -180,12 +180,6 @@ void InfrtDialect::printType(::mlir::Type type,
os << "tensor_list"; os << "tensor_list";
return; return;
} }
// print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
if (type.isa<DenseTensorMapType>()) {
os << "dense_tensor_map";
return;
}
llvm_unreachable("unknown infrt type."); llvm_unreachable("unknown infrt type.");
} }
......
...@@ -23,16 +23,16 @@ ...@@ -23,16 +23,16 @@
namespace infrt { namespace infrt {
phi::Backend ConvertTargetToPhi(TargetType target); ::phi::Backend ConvertTargetToPhi(TargetType target);
TargetType ConvertTargetFromPhi(phi::Backend backend); TargetType ConvertTargetFromPhi(::phi::Backend backend);
phi::DataType ConvertPrecisionToPhi(PrecisionType precision); ::phi::DataType ConvertPrecisionToPhi(PrecisionType precision);
PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype); PrecisionType ConvertPrecisionFromPhi(::phi::DataType datatype);
phi::DataLayout ConvertLayoutToPhi(LayoutType layout); ::phi::DataLayout ConvertLayoutToPhi(LayoutType layout);
LayoutType ConvertLayoutFromPhi(phi::DataLayout layout); LayoutType ConvertLayoutFromPhi(::phi::DataLayout layout);
phi::KernelKey ConvertPlaceToPhi(const Place& place); ::phi::KernelKey ConvertPlaceToPhi(const Place& place);
Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg); Place ConvertPlaceFromPhi(::phi::TensorArgDef tensor_arg);
} // namespace infrt } // namespace infrt
...@@ -37,4 +37,8 @@ def Allocator : PHI_Type<"Allocator"> { ...@@ -37,4 +37,8 @@ def Allocator : PHI_Type<"Allocator"> {
let assemblyFormat = "`<` $target `>`"; let assemblyFormat = "`<` $target `>`";
} }
def PD_DenseTensorMap : PHI_Type<"DenseTensorMap"> {
let mnemonic = "dense_tensor_map";
}
#endif #endif
...@@ -51,12 +51,46 @@ class CreateContextOp<string target> ...@@ -51,12 +51,46 @@ class CreateContextOp<string target>
let results = (outs Context:$output); let results = (outs Context:$output);
} }
def PDT_LoadParamsOp : PDT_Op<"load_params", [NoSideEffect]> {
// input path of model params.
let arguments = (ins StrAttr:$path);
let results = (outs PD_DenseTensorMap:$out);
let assemblyFormat = "`(``)`attr-dict";
}
def PDT_LoadCombinedParamsOp : PDT_Op<"load_combined_params", [NoSideEffect]> {
// input paths of the model file and the combined params file.
let arguments = (ins StrAttr:$model_path, StrAttr:$params_path);
let results = (outs PD_DenseTensorMap:$out);
let assemblyFormat = "`(``)`attr-dict";
}
def PDT_TensorMapGetSizeOp : PDT_Op<"tensor_map_get_size", [NoSideEffect]> {
let arguments = (ins PD_DenseTensorMap:$map);
let results = (outs I32:$size);
let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)";
}
class TensorMapGetTensorOp:
PDT_Op<"tensor_map_get_tensor"> {
let arguments = (ins
PD_DenseTensorMap:$map,
StrAttr:$name
);
let results = (outs DenseTensor:$output);
let assemblyFormat = "`(` operands `)` attr-dict `->` type($output)";
let verifier = ?;
}
def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">; def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">;
def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">; def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">;
def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp<F32ArrayAttr, "f32">; def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp<F32ArrayAttr, "f32">;
def PDT_CreateCPUContextOp : CreateContextOp<"cpu">; def PDT_CreateCPUContextOp : CreateContextOp<"cpu">;
def PDT_CreateGPUContextOp : CreateContextOp<"gpu">; def PDT_CreateGPUContextOp : CreateContextOp<"gpu">;
def PDT_PrintDenseTensor : PrintDenseTensorOp; def PDT_PrintDenseTensor : PrintDenseTensorOp;
def PDT_TensorMapGetTensorOp: TensorMapGetTensorOp;
def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { def FakeKernelOp : PDT_Op<"fake_phi_kernel"> {
let arguments = (ins Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); let arguments = (ins Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y);
......
...@@ -351,18 +351,26 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( ...@@ -351,18 +351,26 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
auto attrs = op->getAttrs(); auto attrs = op->getAttrs();
// MLIR's underlying attr storage type is `Builtin_Dictionary`, and its // MLIR's underlying attr storage type is `Builtin_Dictionary`, and its
// elements // elements are sorted by name. The following code adapts the order of
// are sorted by name. The following code adapts the order of function // function signatures of the phi operator library.
// signatures
// of the phi operator library.
llvm::SmallVector<Value*, 4> tmp; llvm::SmallVector<Value*, 4> tmp;
tmp.resize(attrs.size()); tmp.resize(attrs.size());
const std::string& kernel_name = op->getName().getStringRef().str(); const std::string& kernel_name = op->getName().getStringRef().str();
const auto& attr_names = kernel_registry.GetAttrNameList(kernel_name); const auto& attr_names = kernel_registry.GetAttrNameList(kernel_name);
if (attrs.size() && attr_names.empty()) { if (attrs.size()) {
LOG(WARNING) << "The kernel `" << kernel_name if (attr_names.empty()) {
<< "` has no specified attr order."; LOG(WARNING) << "The kernel `" << kernel_name
<< "` has not been registered with "
"`KernelRegistry::AddKernelWithAttrs()`.";
} else {
CHECK_EQ(attr_names.size(), attrs.size())
<< "The number of kernel `" << kernel_name
<< "` attributes specified by mlir (" << attrs.size()
<< ") is inconsistent with the registration (" << attr_names.size()
<< ").";
}
} }
auto get_offset = [](const char* attr, auto get_offset = [](const char* attr,
const std::vector<const char*>& names, const std::vector<const char*>& names,
const std::string& kernel_name) -> int { const std::string& kernel_name) -> int {
...@@ -385,7 +393,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( ...@@ -385,7 +393,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
} else { } else {
offset = i; offset = i;
} }
CHECK_NE(offset, -1); CHECK_GT(offset, -1);
if (auto v = EmitAttribute<int32_t>(attr.getValue())) { if (auto v = EmitAttribute<int32_t>(attr.getValue())) {
tmp[offset] = new Value(*v); tmp[offset] = new Value(*v);
} else if (auto v = EmitAttribute<int64_t>(attr.getValue())) { } else if (auto v = EmitAttribute<int64_t>(attr.getValue())) {
......
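The warning and CHECK added above exist because MLIR stores attributes in a `Builtin_Dictionary` sorted by name, while the phi kernel expects them in the order they were registered with `AddKernelWithAttrs`. The sketch below shows only that reordering idea, with `int` standing in for `host_context::Value*`; the function and the sample attribute names are illustrative, not the translator's real API.
#include <cassert>
#include <map>
#include <string>
#include <vector>
// Move values from name-sorted order into kernel-signature order.
std::vector<int> ReorderAttrs(const std::map<std::string, int> &by_name,
                              const std::vector<std::string> &signature) {
  assert(by_name.size() == signature.size());
  std::vector<int> ordered(signature.size());
  for (size_t i = 0; i < signature.size(); ++i) {
    ordered[i] = by_name.at(signature[i]);
  }
  return ordered;
}
// ReorderAttrs({{"dims", 3}, {"lod", 1}, {"precision", 2}},
//              {"dims", "lod", "precision"}) returns {3, 1, 2}.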
...@@ -79,7 +79,7 @@ mlir::FuncOp MLIRModelGenImpl::UpdateModelModule( ...@@ -79,7 +79,7 @@ mlir::FuncOp MLIRModelGenImpl::UpdateModelModule(
llvm::SmallVector<mlir::Type, 4> MLIRModelGenImpl::GetModelInputsType( llvm::SmallVector<mlir::Type, 4> MLIRModelGenImpl::GetModelInputsType(
const infrt::paddle::framework_proto::ProgramDesc &program) { const infrt::paddle::framework_proto::ProgramDesc &program) {
llvm::SmallVector<mlir::Type, 4> operandTypes; llvm::SmallVector<mlir::Type, 4> operandTypes;
operandTypes.push_back(infrt::DenseTensorMapType::get(context_)); operandTypes.push_back(infrt::DenseHostTensorMapType::get(context_));
for (auto &op_desc : main_block_.ops()) { for (auto &op_desc : main_block_.ops()) {
if (op_desc.type() != "feed") continue; if (op_desc.type() != "feed") continue;
for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) { for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) {
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#ifdef INFRT_WITH_PHI #ifdef INFRT_WITH_PHI
#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/backends/host/phi_allocator.h"
#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/infrt/backends/host/phi_context.h"
#include "paddle/infrt/tensor/phi/tensor_map.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/backend.h" #include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
...@@ -84,22 +85,23 @@ using ValueVariantType = ...@@ -84,22 +85,23 @@ using ValueVariantType =
#ifdef INFRT_WITH_GPU #ifdef INFRT_WITH_GPU
backends::GpuPhiContext, backends::GpuPhiContext,
::phi::GPUContext, ::phi::GPUContext,
#endif #endif // INFRT_WITH_GPU
::phi::CPUContext, ::phi::CPUContext,
std::vector<const phi::DenseTensor*>, std::vector<const ::phi::DenseTensor*>,
std::vector<phi::DenseTensor*>, std::vector<::phi::DenseTensor*>,
paddle::experimental::ScalarBase<phi::DenseTensor>, paddle::experimental::ScalarBase<::phi::DenseTensor>,
paddle::experimental::ScalarArrayBase<phi::DenseTensor>, paddle::experimental::ScalarArrayBase<::phi::DenseTensor>,
std::vector<phi::MetaTensor*>, std::vector<::phi::MetaTensor*>,
phi::MetaConfig, ::phi::MetaConfig,
paddle::experimental::Backend, paddle::experimental::Backend,
paddle::experimental::DataLayout, paddle::experimental::DataLayout,
paddle::experimental::DataType, paddle::experimental::DataType,
::infrt::phi::DenseTensorMap,
#endif // INFRT_WITH_PHI
#ifdef INFRT_WITH_TRT #ifdef INFRT_WITH_TRT
::infrt::backends::tensorrt::TrtEngine, ::infrt::backends::tensorrt::TrtEngine,
::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol, ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol,
#endif // INFRT_WITH_TRT #endif // INFRT_WITH_TRT
#endif
std::vector<int16_t>, std::vector<int16_t>,
std::vector<int32_t>, std::vector<int32_t>,
std::vector<int64_t>, std::vector<int64_t>,
...@@ -136,6 +138,7 @@ class Value : public common::Object { ...@@ -136,6 +138,7 @@ class Value : public common::Object {
explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {}
explicit Value(MlirFunctionExecutable* x) : data(x) {} explicit Value(MlirFunctionExecutable* x) : data(x) {}
#ifdef INFRT_WITH_PHI #ifdef INFRT_WITH_PHI
explicit Value(::infrt::phi::DenseTensorMap&& x) : data(std::move(x)) {}
explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {}
explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {}
#ifdef INFRT_WITH_GPU #ifdef INFRT_WITH_GPU
......
...@@ -13,8 +13,11 @@ ...@@ -13,8 +13,11 @@
// limitations under the License. // limitations under the License.
#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
#include "paddle/infrt/common/string.h"
#include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/dialect/phi/data_type.h"
#include "paddle/infrt/kernel/phi/context_kernels.h" #include "paddle/infrt/kernel/phi/context_kernels.h"
#include "paddle/infrt/paddle/model_parser.h"
#include "paddle/infrt/paddle/scope.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
...@@ -22,6 +25,18 @@ ...@@ -22,6 +25,18 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif #endif
namespace paddle {
namespace platform {
using DeviceContext = ::phi::DeviceContext;
} // namespace platform
namespace framework {
using LoDTensor = ::phi::DenseTensor;
void DeserializeFromStream(std::istream& is,
LoDTensor* tensor,
const platform::DeviceContext& dev_ctx);
}
} // namespace paddle
namespace infrt { namespace infrt {
namespace kernel { namespace kernel {
namespace phi { namespace phi {
...@@ -130,6 +145,89 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { ...@@ -130,6 +145,89 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor) {
std::cout << "]\n"; std::cout << "]\n";
#undef PRINT_META_DATA #undef PRINT_META_DATA
} }
::infrt::phi::DenseTensorMap LoadParams(
host_context::Attribute<std::string> path) {
const auto& file_path = path.get();
std::cout << "loading params from: " << file_path << std::endl;
::infrt::phi::DenseTensorMap map;
const std::string model_path = file_path + "/__model__";
auto pb_proto_prog = paddle::LoadProgram(model_path);
auto main_block = pb_proto_prog->blocks(0);
for (auto& var : main_block.vars()) {
if (var.name() == "feed" || var.name() == "fetch" || !var.persistable())
continue;
std::string param_path = file_path + "/" + var.name();
std::ifstream param_file(param_path, std::ios::binary);
switch (var.type().type()) {
case ::paddle::framework::proto::VarType_Type_LOD_TENSOR: {
std::unique_ptr<::phi::DenseTensor> tensor{
std::make_unique<::phi::DenseTensor>()};
::phi::CPUContext ctx;
::paddle::framework::DeserializeFromStream(
param_file, tensor.get(), ctx);
map.SetDenseTensor(var.name(), std::move(tensor));
} break;
default: {
LOG(WARNING) << "Var `" << var.name() << "` type `"
<< static_cast<int>(var.type().type())
<< "` has not been supported now.";
}
}
}
return map;
}
::infrt::phi::DenseTensorMap LoadCombinedParams(
host_context::Attribute<std::string> model_path,
host_context::Attribute<std::string> params_path) {
const auto& model = model_path.get();
std::cout << "loading params from: " << model << std::endl;
::infrt::phi::DenseTensorMap map;
auto pb_proto_prog = paddle::LoadProgram(model);
auto main_block = pb_proto_prog->blocks(0);
std::ifstream param_file(params_path.get(), std::ios::binary);
std::set<std::string> tmp;
for (auto& var : main_block.vars()) {
if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) {
continue;
}
if (var.type().type() ==
::paddle::framework::proto::VarType_Type_LOD_TENSOR) {
tmp.emplace(var.name());
} else {
llvm_unreachable("the tensor type is illegal.");
}
}
for (auto& var : tmp) {
std::unique_ptr<::phi::DenseTensor> tensor{
std::make_unique<::phi::DenseTensor>()};
::phi::CPUContext ctx;
::paddle::framework::DeserializeFromStream(param_file, tensor.get(), ctx);
map.SetDenseTensor(var, std::move(tensor));
}
return map;
}
::phi::DenseTensor TensorMapGetTensor(
const ::infrt::phi::DenseTensorMap& map,
host_context::Attribute<std::string> name) {
auto* tensor = map.GetDenseTensor(name.get());
CHECK(tensor);
return *tensor;
}
int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map) {
return map.size();
}
} // namespace phi } // namespace phi
} // namespace kernel } // namespace kernel
} // namespace infrt } // namespace infrt
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/backends/host/phi_allocator.h"
#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/infrt/host_context/kernel_utils.h"
#include "paddle/infrt/tensor/phi/tensor_map.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace infrt { namespace infrt {
...@@ -41,6 +42,19 @@ void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, ...@@ -41,6 +42,19 @@ void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
host_context::Attribute<std::vector<float>> values); host_context::Attribute<std::vector<float>> values);
void PrintDenseTensor(::phi::DenseTensor* dense_tensor); void PrintDenseTensor(::phi::DenseTensor* dense_tensor);
infrt::phi::DenseTensorMap LoadParams(
host_context::Attribute<std::string> path);
::phi::DenseTensor TensorMapGetTensor(
const ::infrt::phi::DenseTensorMap& map,
host_context::Attribute<std::string> name);
::infrt::phi::DenseTensorMap LoadCombinedParams(
host_context::Attribute<std::string> model_path,
host_context::Attribute<std::string> params_path);
int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map);
} // namespace phi } // namespace phi
} // namespace kernel } // namespace kernel
} // namespace infrt } // namespace infrt
...@@ -37,15 +37,16 @@ TEST(utils, registry) { ...@@ -37,15 +37,16 @@ TEST(utils, registry) {
CHECK_EQ(count, 2U); CHECK_EQ(count, 2U);
} }
class FancyAllocator : public phi::Allocator { class FancyAllocator : public ::phi::Allocator {
public: public:
static void Delete(phi::Allocation* allocation) { static void Delete(::phi::Allocation* allocation) {
::operator delete(allocation->ptr()); ::operator delete(allocation->ptr());
} }
AllocationPtr Allocate(size_t bytes_size) override { AllocationPtr Allocate(size_t bytes_size) override {
void* data = ::operator new(bytes_size); void* data = ::operator new(bytes_size);
auto* allocation = new phi::Allocation(data, bytes_size, phi::CPUPlace()); auto* allocation =
new ::phi::Allocation(data, bytes_size, ::phi::CPUPlace());
return AllocationPtr(allocation, Delete); return AllocationPtr(allocation, Delete);
} }
}; };
...@@ -56,20 +57,20 @@ TEST(ElementwiseAdd, launcher_registry) { ...@@ -56,20 +57,20 @@ TEST(ElementwiseAdd, launcher_registry) {
ASSERT_GE(registry.size(), 1UL); ASSERT_GE(registry.size(), 1UL);
auto creator = registry.GetKernel("phi_cpu.add.float32.any"); auto creator = registry.GetKernel("phi_cpu.add.float32.any");
const phi::DDim dims({1, 2}); const ::phi::DDim dims({1, 2});
const phi::DataType dtype{phi::DataType::FLOAT32}; const ::phi::DataType dtype{::phi::DataType::FLOAT32};
const phi::DataLayout layout{phi::DataLayout::NHWC}; const ::phi::DataLayout layout{::phi::DataLayout::NHWC};
const phi::LoD lod{}; const ::phi::LoD lod{};
phi::DenseTensorMeta meta(dtype, dims, layout, lod); ::phi::DenseTensorMeta meta(dtype, dims, layout, lod);
auto fancy_allocator = std::unique_ptr<phi::Allocator>(new FancyAllocator); auto fancy_allocator = std::unique_ptr<::phi::Allocator>(new FancyAllocator);
auto* alloc = fancy_allocator.get(); auto* alloc = fancy_allocator.get();
phi::DenseTensor a(alloc, meta); ::phi::DenseTensor a(alloc, meta);
phi::DenseTensor b(alloc, meta); ::phi::DenseTensor b(alloc, meta);
phi::DenseTensor c(alloc, meta); ::phi::DenseTensor c(alloc, meta);
auto place = phi::CPUPlace(); auto place = ::phi::CPUPlace();
float* a_data = a.mutable_data<float>(place); float* a_data = a.mutable_data<float>(place);
float* b_data = b.mutable_data<float>(place); float* b_data = b.mutable_data<float>(place);
float* c_data = c.mutable_data<float>(place); float* c_data = c.mutable_data<float>(place);
...@@ -78,7 +79,7 @@ TEST(ElementwiseAdd, launcher_registry) { ...@@ -78,7 +79,7 @@ TEST(ElementwiseAdd, launcher_registry) {
b_data[i] = 2.f; b_data[i] = 2.f;
} }
phi::CPUContext context; ::phi::CPUContext context;
context.SetAllocator(alloc); context.SetAllocator(alloc);
context.Init(); context.Init();
......
...@@ -53,6 +53,19 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { ...@@ -53,6 +53,19 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor),
{"dims", "lod", "layout", "precision"}); {"dims", "lod", "layout", "precision"});
#endif #endif
registry->AddKernelWithAttrs("phi_dt.load_params",
INFRT_KERNEL(infrt::kernel::phi::LoadParams),
{"path"});
registry->AddKernelWithAttrs(
"phi_dt.load_combined_params",
INFRT_KERNEL(infrt::kernel::phi::LoadCombinedParams),
{"model_path", "params_path"});
registry->AddKernelWithAttrs(
"phi_dt.tensor_map_get_tensor",
INFRT_KERNEL(infrt::kernel::phi::TensorMapGetTensor),
{"name"});
registry->AddKernel("phi_dt.tensor_map_get_size",
INFRT_KERNEL(infrt::kernel::phi::TensorMapGetSize));
} }
} // namespace kernel } // namespace kernel
......
...@@ -68,14 +68,14 @@ int32_t TensorMapGetSize(TensorMap map) { return map.size(); } ...@@ -68,14 +68,14 @@ int32_t TensorMapGetSize(TensorMap map) { return map.size(); }
// TODO(wilber): Maybe we should place TensorList type in dt dialect. // TODO(wilber): Maybe we should place TensorList type in dt dialect.
#ifdef INFRT_WITH_PHI #ifdef INFRT_WITH_PHI
phi::DenseTensor TensorListGetTensor(std::vector<phi::DenseTensor *> list, ::phi::DenseTensor TensorListGetTensor(std::vector<::phi::DenseTensor *> list,
Attribute<int32_t> idx) { Attribute<int32_t> idx) {
CHECK_LT(idx.get(), static_cast<int>(list.size())) CHECK_LT(idx.get(), static_cast<int>(list.size()))
<< "idx should less than list size"; << "idx should less than list size";
return *list[idx.get()]; return *list[idx.get()];
} }
int32_t TensorListGetSize(const std::vector<phi::DenseTensor *> &list) { int32_t TensorListGetSize(const std::vector<::phi::DenseTensor *> &list) {
return list.size(); return list.size();
} }
#endif #endif
......
core_gather_headers() core_gather_headers()
add_subdirectory(phi)
gather_srcs(infrt_src SRCS gather_srcs(infrt_src SRCS
tensor_map.cc tensor_map.cc
tensor_metadata.cc tensor_metadata.cc
......
gather_srcs(infrt_src SRCS
tensor_map.cc
)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/tensor/phi/tensor_map.h"
#include "llvm/Support/ErrorHandling.h"
namespace infrt {
namespace phi {
void DenseTensorMap::SetDenseTensor(
const std::string& name, std::unique_ptr<::phi::DenseTensor>&& tensor) {
std::lock_guard<std::mutex> lock(mu_);
auto it = map_.emplace(std::make_pair(name, std::move(tensor)));
if (!it.second) {
llvm_unreachable("dense tensor map insert failed.");
}
}
::phi::DenseTensor* DenseTensorMap::GetDenseTensor(
const std::string& name) const {
std::lock_guard<std::mutex> lock(mu_);
auto it = map_.find(name);
if (it != map_.end()) {
return it->second.get();
}
LOG(WARNING) << "can not find `" << name << "` in the tensor map.";
return nullptr;
}
size_t DenseTensorMap::size() const {
std::lock_guard<std::mutex> lock(mu_);
return map_.size();
}
} // namespace phi
} // namespace infrt
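A minimal usage sketch of the map implemented above; the tensor is default-constructed only for brevity (a real one would be built with an allocator and meta, as in the kernel tests later in this patch), and the variable name is taken from the MLIR tests below.
#include <memory>
#include "paddle/infrt/tensor/phi/tensor_map.h"
void TensorMapExample() {
  infrt::phi::DenseTensorMap map;
  // Ownership of the tensor moves into the map.
  map.SetDenseTensor("fc_bias", std::make_unique<::phi::DenseTensor>());
  // Lookup returns a borrowed pointer, or nullptr (plus a warning) if absent.
  ::phi::DenseTensor *bias = map.GetDenseTensor("fc_bias");
  (void)bias;
  // Re-inserting an existing name hits llvm_unreachable, so callers such as
  // LoadCombinedParams deduplicate names before inserting.
}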
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,21 +12,26 @@ ...@@ -12,21 +12,26 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h" #pragma once
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
template <typename T> #include "paddle/phi/core/dense_tensor.h"
using CUDAFrobeniusNormKernel =
ops::ReduceKernel<paddle::platform::CUDADeviceContext, T,
ops::FrobeniusNormFunctor>;
REGISTER_OP_CUDA_KERNEL(frobenius_norm, CUDAFrobeniusNormKernel<float>, namespace infrt {
CUDAFrobeniusNormKernel<double>); namespace phi {
template <typename T> class DenseTensorMap {
using CUDAFrobeniusNormGradKernel = public:
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T, DenseTensorMap() = default;
ops::FrobeniusNormGradFunctor>; DenseTensorMap(DenseTensorMap&& other) : map_(std::move(other.map_)) {}
void SetDenseTensor(const std::string& name,
std::unique_ptr<::phi::DenseTensor>&& tensor);
::phi::DenseTensor* GetDenseTensor(const std::string& name) const;
size_t size() const;
REGISTER_OP_CUDA_KERNEL(frobenius_norm_grad, CUDAFrobeniusNormGradKernel<float>, private:
CUDAFrobeniusNormGradKernel<double>); mutable std::mutex mu_;
std::unordered_map<std::string, std::unique_ptr<::phi::DenseTensor>> map_;
};
} // namespace phi
} // namespace infrt
...@@ -12,3 +12,30 @@ func @load_tensor_map() { ...@@ -12,3 +12,30 @@ func @load_tensor_map() {
infrt.return infrt.return
} }
func @load_phi_tensor_map() {
%map = phi_dt.load_params(){path="@CMAKE_BINARY_DIR@/multi_fc_model"}
%size = phi_dt.tensor_map_get_size(%map) -> i32
infrt.print.i32 %size
%a = phi_dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
// CHECK: dense_tensor: shape=shape[2], value=[0,0]
phi_dt.print_tensor (%a : !infrt.dense_tensor<CPU, FP32, NCHW>)
infrt.return
}
func @load_combined_phi_tensor_map() {
%map = phi_dt.load_combined_params(){model_path="@CMAKE_BINARY_DIR@/multi_fc_model/fc.pdmodel",
params_path="@CMAKE_BINARY_DIR@/multi_fc_model/fc.pdiparams"}
%size = phi_dt.tensor_map_get_size(%map) -> i32
infrt.print.i32 %size
%a = phi_dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
// CHECK: dense_tensor: shape=shape[2], value=[0,0]
phi_dt.print_tensor (%a : !infrt.dense_tensor<CPU, FP32, NCHW>)
infrt.return
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/macros.h"
#include "paddle/utils/flat_hash_map.h"
namespace phi {
class DeviceContext;
class CPUContext;
class GPUContext;
} // namespace phi
namespace paddle {
namespace experimental {
template <AllocationType T>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<AllocationType::CPU> {
using TYPE = phi::CPUContext;
};
template <>
struct DefaultDeviceContextType<AllocationType::GPU> {
using TYPE = phi::GPUContext;
};
/**
* The DeviceContextPool here is just a mirror of the DeviceContextPool in
* fluid, and does not manage the life cycle of the DeviceContext.
* It is mainly used for external custom operator calls and high-performance
* C++ APIs.
*
* Since DeviceContextPool in fluid is a global singleton, it always exists
* in program running, so DeviceContextPool here can always access the correct
* DeviceContext pointer.
*
* In order not to depend on the fluid's DeviceContextPool,
* the DeviceContextPool here needs to be initialized in the fluid, and cannot
* be initialized by itself.
*/
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
const phi::DeviceContext* Get(const Place& place) const;
phi::DeviceContext* GetMutable(const Place& place);
template <AllocationType T>
const typename DefaultDeviceContextType<T>::TYPE* Get(
const Place& place) const {
return reinterpret_cast<const typename DefaultDeviceContextType<T>::TYPE*>(
Get(place));
}
private:
DeviceContextPool();
paddle::flat_hash_map<Place, const phi::DeviceContext*, Place::Hash>
context_map_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};
} // namespace experimental
} // namespace paddle
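A minimal usage sketch of this mirror pool, assuming the fluid-side `DeviceContextPool` has already been initialized as the comment above requires (the include path is the one added by this patch; everything else is illustrative):
#include "paddle/phi/api/include/context_pool.h"
void UseCpuContext() {
  auto &pool = paddle::experimental::DeviceContextPool::Instance();
  // The templated Get<> resolves to the default context type for the place,
  // here const phi::CPUContext*.
  const phi::CPUContext *cpu_ctx =
      pool.Get<phi::AllocationType::CPU>(phi::CPUPlace());
  (void)cpu_ctx;
}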
...@@ -135,8 +135,9 @@ add_custom_command( ...@@ -135,8 +135,9 @@ add_custom_command(
cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw)
cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi)
cc_library(context_pool SRCS context_pool.cc DEPS phi_context phi_enforce place)
cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory context_pool)
cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform)
cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/enforce.h"
namespace paddle {
namespace experimental {
DeviceContextPool& DeviceContextPool::Instance() {
static DeviceContextPool g_device_context_pool;
return g_device_context_pool;
}
const phi::DeviceContext* DeviceContextPool::Get(const Place& place) const {
auto it = context_map_.find(place);
PADDLE_ENFORCE_NE(
it,
context_map_.end(),
phi::errors::NotFound("The DeviceContext of %s does not exist.", place));
return it->second;
}
phi::DeviceContext* DeviceContextPool::GetMutable(const Place& place) {
return const_cast<phi::DeviceContext*>(Get(place));
}
DeviceContextPool::DeviceContextPool() {
// We need to make sure that the correct value exists
// whenever we get the DeviceContext from DeviceContextPool
const auto& device_contexts =
paddle::platform::DeviceContextPool::Instance().device_contexts();
for (const auto& pair : device_contexts) {
// only get CPU and GPU DeviceContext now, add other DeviceContext type
// later if needed
if (platform::is_cpu_place(pair.first)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
||
platform::is_gpu_place(pair.first)) {
#else
) {
#endif
const phi::DeviceContext* dev_ctx = pair.second.get().get();
VLOG(3) << "Init phi DeviceContextPool: insert {" << pair.first << ", "
<< dev_ctx << "}";
context_map_[pair.first] = dev_ctx;
}
}
}
} // namespace experimental
} // namespace paddle
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
namespace paddle { namespace paddle {
...@@ -52,8 +53,8 @@ std::size_t CountLeadingZeros(uint64_t val) { ...@@ -52,8 +53,8 @@ std::size_t CountLeadingZeros(uint64_t val) {
} // namespace detail } // namespace detail
phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend) { phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend) {
auto& pool = paddle::platform::DeviceContextPool::Instance(); auto& pool = paddle::experimental::DeviceContextPool::Instance();
return pool.Get(phi::TransToPhiPlace(backend)); return pool.GetMutable(phi::TransToPhiPlace(backend));
} }
DataType ParseDataType(DataType dtype) { return dtype; } DataType ParseDataType(DataType dtype) { return dtype; }
......
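Put differently, the new dispatch path is backend -> place -> context. A hedged sketch of the same call chain in isolation (header paths follow the includes in this patch; `TransToPhiPlace` is assumed to keep its usual `Backend -> Place` signature):
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/core/compat/convert_utils.h"
phi::DeviceContext *GetCpuContextSketch() {
  // Backend -> Place, then Place -> mutable DeviceContext from the pool.
  return paddle::experimental::DeviceContextPool::Instance().GetMutable(
      phi::TransToPhiPlace(phi::Backend::CPU));
}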
...@@ -92,4 +92,20 @@ std::string GetGlobalDeviceType(size_t device_type_id) { ...@@ -92,4 +92,20 @@ std::string GetGlobalDeviceType(size_t device_type_id) {
return global_registered_device_type[device_type_id]; return global_registered_device_type[device_type_id];
} }
constexpr static int kAllocationTypeBitLength = 8;
constexpr static int kDeviceTypeIDBitLength = 8;
constexpr static int kDeviceIDBitLength = 8;
uint32_t Place::Hash::operator()(const Place &place) const {
uint32_t hash_value = 0;
// |----31-24------|-----23-16------|-----15-08----|---7-0----|
// | For extension | AllocationType | DeviceTypeID | DeviceID |
hash_value |= (static_cast<uint8_t>(place.alloc_type_)
<< (kDeviceIDBitLength + kDeviceTypeIDBitLength));
hash_value |=
(static_cast<uint8_t>(place.device_type_id_) << kDeviceIDBitLength);
hash_value |= static_cast<uint8_t>(place.device);
return hash_value;
}
} // namespace phi } // namespace phi
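A worked example of the packing above. Only the bit layout is taken from the code; the concrete enum values used in the trailing comment (e.g. treating GPU as 2) are assumptions for illustration.
#include <cstdint>
// |----31-24------|-----23-16------|-----15-08----|---7-0----|
// | For extension | AllocationType | DeviceTypeID | DeviceID |
uint32_t PackPlaceHash(uint8_t alloc_type, uint8_t device_type_id,
                       uint8_t device_id) {
  return (static_cast<uint32_t>(alloc_type) << 16) |
         (static_cast<uint32_t>(device_type_id) << 8) |
         static_cast<uint32_t>(device_id);
}
// If AllocationType::GPU were 2, a GPU place on device 3 would hash to
// PackPlaceHash(2, 0, 3) == 0x00020003, so places differing only in device
// id (or only in allocation type) land in different buckets.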
...@@ -73,31 +73,23 @@ class Place { ...@@ -73,31 +73,23 @@ class Place {
std::string DebugString() const; std::string DebugString() const;
struct Hash {
// Note: Now the number of bits we need does not exceed 32 bits, so there is
// no need to use 64 bits. If needed in the future, it can be expanded,
// but now we don’t over-design.
uint32_t operator()(const Place& place) const;
};
uint32_t HashValue() const { return Hash()(*this); }
inline bool operator==(const Place& rhs) const { inline bool operator==(const Place& rhs) const {
if (alloc_type_ != rhs.GetType()) { return HashValue() == rhs.HashValue();
return false; }
} inline bool operator!=(const Place& rhs) const {
if (alloc_type_ == AllocationType::CPU || return HashValue() != rhs.HashValue();
alloc_type_ == AllocationType::GPUPINNED ||
alloc_type_ == AllocationType::NPUPINNED) {
return true;
}
if (alloc_type_ == AllocationType::CUSTOM) {
return device_type_id_ == rhs.device_type_id_ &&
device == rhs.GetDeviceId();
}
return device == rhs.GetDeviceId();
} }
inline bool operator!=(const Place& rhs) const { return !(*this == rhs); }
inline bool operator<(const Place& rhs) const { inline bool operator<(const Place& rhs) const {
if (alloc_type_ != rhs.GetType()) { return HashValue() < rhs.HashValue();
return static_cast<int>(alloc_type_) < static_cast<int>(rhs.GetType());
}
if (alloc_type_ == AllocationType::CUSTOM &&
device_type_id_ != rhs.device_type_id_) {
return device_type_id_ < rhs.device_type_id_;
}
return device < rhs.GetDeviceId();
} }
public: public:
...@@ -206,3 +198,10 @@ class CustomPlace : public Place { ...@@ -206,3 +198,10 @@ class CustomPlace : public Place {
std::ostream& operator<<(std::ostream&, const Place&); std::ostream& operator<<(std::ostream&, const Place&);
} // namespace phi } // namespace phi
namespace paddle {
namespace experimental {
using AllocationType = phi::AllocationType;
using Place = phi::Place;
} // namespace experimental
} // namespace paddle
...@@ -64,6 +64,45 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, ...@@ -64,6 +64,45 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x,
} }
} }
void ConvTransposeGradInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* dx,
MetaTensor* dfilter) {
GeneralBinaryGradInferMeta(x, filter, dx, dfilter);
}
void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& dout,
const MetaTensor& ddx,
const MetaTensor& ddfilter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* dx,
MetaTensor* dfilter,
MetaTensor* ddout) {
GeneralBinaryGradInferMeta(x, filter, dx, dfilter);
if (ddout) {
ddout->share_meta(dout);
}
}
void GatherNdGradInferMeta(const MetaTensor& x, void GatherNdGradInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
const MetaTensor& out_grad, const MetaTensor& out_grad,
......
...@@ -37,6 +37,37 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, ...@@ -37,6 +37,37 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x,
MetaTensor* dweight, MetaTensor* dweight,
MetaTensor* dbias); MetaTensor* dbias);
void ConvTransposeGradInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* dx,
MetaTensor* dfilter);
void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& dout,
const MetaTensor& ddx,
const MetaTensor& ddfilter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* dx,
MetaTensor* dfilter,
MetaTensor* ddout);
void GatherNdGradInferMeta(const MetaTensor& x, void GatherNdGradInferMeta(const MetaTensor& x,
const MetaTensor& index, const MetaTensor& index,
const MetaTensor& out_grad, const MetaTensor& out_grad,
......
...@@ -17,8 +17,10 @@ limitations under the License. */ ...@@ -17,8 +17,10 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/cpu/conv_util.h"
...@@ -312,51 +314,6 @@ void CompareAllInferMeta(const MetaTensor& x, ...@@ -312,51 +314,6 @@ void CompareAllInferMeta(const MetaTensor& x,
out->set_dtype(DataType::BOOL); out->set_dtype(DataType::BOOL);
} }
void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y,
int axis,
MetaTensor* out) {
auto x_dim = x.dims();
auto y_dim = y.dims();
auto dim = axis;
bool dims_match = phi::funcs::CheckDims(x_dim, y_dim);
PADDLE_ENFORCE_EQ(
dims_match,
true,
phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to "
"the 'shape' of Input(Y). But received "
"Input(X).dimensions = [%s], "
"Input(Y).dimensions = [%s]",
x_dim,
y_dim));
if (dim != DDim::kMaxRank) {
PADDLE_ENFORCE_EQ(
dim < x_dim.size() && dim >= (0 - x_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
x_dim.size(),
x_dim.size() - 1,
dim));
if (dim < 0) {
dim += x_dim.size();
}
PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3,
true,
phi::errors::InvalidArgument(
"Input(X/Y).dims()[dim] should be equal to 3."
"But received Input(X/Y).dims()[dim] = %d.",
x_dim[dim]));
}
out->set_dims(x_dim);
out->set_dtype(x.dtype());
out->set_layout(x.layout());
out->share_lod(x);
}
void ConvInferMeta(const MetaTensor& input, void ConvInferMeta(const MetaTensor& input,
const MetaTensor& filter, const MetaTensor& filter,
const std::vector<int>& strides, const std::vector<int>& strides,
...@@ -512,6 +469,241 @@ void ConvInferMeta(const MetaTensor& input, ...@@ -512,6 +469,241 @@ void ConvInferMeta(const MetaTensor& input,
out->set_dtype(input.dtype()); out->set_dtype(input.dtype());
} }
void ConvTransposeInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* out,
MetaConfig config) {
auto x_dims = x.dims();
auto filter_dims = filter.dims();
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
const DataLayout data_layout =
config.is_run_mkldnn_kernel
? DataLayout::kNCHW
: paddle::framework::StringToDataLayout(data_format);
PADDLE_ENFORCE_EQ(
x_dims.size() == 4 || x_dims.size() == 5,
true,
errors::InvalidArgument("Input of Op(conv_transpose) should be 4-D or "
"5-D Tensor. But received: %u-D Tensor, "
"the shape of input is [%s]",
x_dims.size(),
x_dims));
PADDLE_ENFORCE_EQ(
x_dims.size(),
filter_dims.size(),
errors::InvalidArgument(
"The input's dimension size and filter's dimension size of "
"Op (conv_transpose) should be equal. But received: the shape of "
"input is [%s], the dimension size of input is [%d], the shape "
"of filter is [%s], the dimension size of filter is [%d]. ",
x_dims,
x_dims.size(),
filter_dims,
filter_dims.size()));
int stride_size = strides.size();
for (int i = 0; i < stride_size; ++i) {
PADDLE_ENFORCE_GT(
strides[i],
0,
errors::InvalidArgument(
"The stride of Op(Conv) should be larget than 0, but received "
"stride is %d.",
strides[i]));
}
int in_sub_stride_size = x_dims.size() - stride_size;
PADDLE_ENFORCE_EQ(
x_dims.size() - strides.size(),
2U,
errors::InvalidArgument(
"The input's dimension size minus Attr(stride)'s size must "
"be euqal to 2 for Op(conv_transpose). But received: [%d], the "
"input's dimension size is [%d], the shape of input "
"is [%s], the Attr(stride)'s size is [%d].",
in_sub_stride_size,
x_dims.size(),
x_dims,
strides.size()));
if (output_size.size())
PADDLE_ENFORCE_EQ(
output_size.size(),
strides.size(),
errors::InvalidArgument(
"The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
"should be the same."));
if (output_padding.size())
PADDLE_ENFORCE_EQ(
output_padding.size(),
strides.size(),
errors::InvalidArgument(
"The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
"should be the same."));
const int64_t C =
(data_layout != DataLayout::kNHWC ? x_dims[1]
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(
C,
filter_dims[0],
errors::InvalidArgument(
"The number of input channels should be equal to filter channels "
"for Op(conv_transpose). But received: the input's channels is "
"[%d], the shape of input is [%s], the filter's channels is [%d], "
"the shape of filter is [%s]. The data_format is %s."
"The error may come from wrong data_format setting.",
C,
x_dims,
filter_dims[0],
filter_dims,
data_format));
DDim x_data_dims;
if (data_layout != DataLayout::kNHWC) {
x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
} else {
x_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
std::vector<int64_t> output_shape({x_dims[0]});
if (data_layout != DataLayout::kNHWC) {
output_shape.push_back(filter_dims[1] * groups);
}
const int offset = (data_layout != DataLayout::kNHWC ? 2 : 1);
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations_[i] * (filter_dims[i + 2] - 1) + 1;
auto infer_shape = (config.is_runtime || x_dims[i + offset] > 0)
? (x_dims[i + offset] - 1) * strides[i] -
paddings_[2 * i] - paddings_[2 * i + 1] +
filter_extent
: -1;
if (output_size.size()) {
if (config.is_runtime) {
PADDLE_ENFORCE_GE(
output_size[i],
infer_shape,
errors::InvalidArgument(
"output_size of Op(ConvTransposeOp) should not be "
"less than the infered output size. But received output_size = "
"[%s], whose dim %d is less than the infered output size [%s]",
make_ddim(output_size).to_str(),
i,
infer_shape));
PADDLE_ENFORCE_LT(
output_size[i],
infer_shape + strides[i],
errors::InvalidArgument(
"output_size of Op(ConvTransposeOp) should be less "
"than infered size + stride. But received output_size = [%s], "
"whose dim %d is not less than the infered output size (%d) + "
"stride (%d) = %d",
make_ddim(output_size).to_str(),
i,
infer_shape,
strides[i],
infer_shape + strides[i]));
}
output_shape.push_back(output_size[i]);
} else if (output_padding.size()) {
if (config.is_runtime) {
PADDLE_ENFORCE_GE(
output_padding[i],
0,
errors::InvalidArgument(
"output_padding of Op(ConvTransposeOp) should not be "
"less than the 0. But received output_padding = "
"[%s], whose dim %d is less than 0",
make_ddim(output_padding).to_str(),
i));
PADDLE_ENFORCE_LT(
output_padding[i],
std::max(strides[i], dilations_[i]),
errors::InvalidArgument(
"output_padding of Op(ConvTransposeOp) should be less "
"than either stride or dilation. But received output_size = "
"[%s], "
"whose dim %d is not less than either stride (%d) or "
"dilation (%d)",
make_ddim(output_size).to_str(),
i,
strides[i],
dilations_[i]));
}
output_shape.push_back((infer_shape + output_padding[i]));
} else {
output_shape.push_back(infer_shape);
}
}
if (data_layout == DataLayout::kNHWC) {
output_shape.push_back(filter_dims[1] * groups);
}
out->set_dims(make_ddim(output_shape));
out->set_dtype(x.dtype());
}
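// Cross product: X and Y must share a shape and the chosen axis must have
// extent 3; the output inherits x's shape, dtype, layout and LoD.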
void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y,
int axis,
MetaTensor* out) {
auto x_dim = x.dims();
auto y_dim = y.dims();
auto dim = axis;
bool dims_match = phi::funcs::CheckDims(x_dim, y_dim);
PADDLE_ENFORCE_EQ(
dims_match,
true,
phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to "
"the 'shape' of Input(Y). But received "
"Input(X).dimensions = [%s], "
"Input(Y).dimensions = [%s]",
x_dim,
y_dim));
if (dim != DDim::kMaxRank) {
PADDLE_ENFORCE_EQ(
dim < x_dim.size() && dim >= (0 - x_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
x_dim.size(),
x_dim.size() - 1,
dim));
if (dim < 0) {
dim += x_dim.size();
}
PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3,
true,
phi::errors::InvalidArgument(
"Input(X/Y).dims()[dim] should be equal to 3."
"But received Input(X/Y).dims()[dim] = %d.",
x_dim[dim]));
}
out->set_dims(x_dim);
out->set_dtype(x.dtype());
out->set_layout(x.layout());
out->share_lod(x);
}
void DistInferMeta(const MetaTensor& x, void DistInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
float p, float p,
......
...@@ -83,6 +83,19 @@ void ConvInferMeta(const MetaTensor& input, ...@@ -83,6 +83,19 @@ void ConvInferMeta(const MetaTensor& input,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void ConvTransposeInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* out,
MetaConfig config = MetaConfig());
void CrossInferMeta(const MetaTensor& x, void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
int axis, int axis,
......
...@@ -305,11 +305,48 @@ void BatchNormInferMeta(const MetaTensor& x, ...@@ -305,11 +305,48 @@ void BatchNormInferMeta(const MetaTensor& x,
y->set_dims(x_dims); y->set_dims(x_dims);
mean_out->set_dims({C}); mean_out->set_dims({C});
variance_out->set_dims({C}); variance_out->set_dims({C});
saved_mean->set_dims({C}); if (saved_mean) {
saved_variance->set_dims({C}); saved_mean->set_dims({C});
}
if (saved_variance) {
saved_variance->set_dims({C});
}
y->share_lod(x); y->share_lod(x);
} }
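// Inference-only batch_norm: forwards to BatchNormInferMeta with the
// training-only outputs (saved mean/variance, reserve space) disabled.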
void BatchNormInferInferMeta(const MetaTensor& x,
const MetaTensor& scale,
const MetaTensor& bias,
const MetaTensor& mean,
const MetaTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
MetaTensor* y,
MetaTensor* mean_out,
MetaTensor* variance_out,
MetaConfig config) {
BatchNormInferMeta(x,
scale,
bias,
mean,
variance,
momentum,
epsilon,
data_layout,
/*is_test=*/true,
/*use_global_stats=*/false,
/*trainable_statistics=*/false,
/*fuse_with_relu=*/false,
y,
mean_out,
variance_out,
/*saved_mean=*/nullptr,
/*saved_variance=*/nullptr,
/*reserve_space=*/nullptr,
config);
}
void BilinearTensorProductInferMeta(const MetaTensor& x, void BilinearTensorProductInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
const MetaTensor& weight, const MetaTensor& weight,
...@@ -689,3 +726,4 @@ void WhereInferMeta(const MetaTensor& condition, ...@@ -689,3 +726,4 @@ void WhereInferMeta(const MetaTensor& condition,
} // namespace phi } // namespace phi
PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta); PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta);
PD_REGISTER_INFER_META_FN(batch_norm_infer, phi::BatchNormInferInferMeta);
...@@ -92,6 +92,19 @@ void BatchNormInferMeta(const MetaTensor& x, ...@@ -92,6 +92,19 @@ void BatchNormInferMeta(const MetaTensor& x,
MetaTensor* reserve_space, MetaTensor* reserve_space,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void BatchNormInferInferMeta(const MetaTensor& x,
const MetaTensor& scale,
const MetaTensor& bias,
const MetaTensor& mean,
const MetaTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
MetaTensor* y,
MetaTensor* mean_out,
MetaTensor* variance_out,
MetaConfig config = MetaConfig());
void BilinearTensorProductInferMeta(const MetaTensor& x, void BilinearTensorProductInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
const MetaTensor& weight, const MetaTensor& weight,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
namespace phi {
template <typename T, typename Context>
void BatchNormInferKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out) {
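// Reference: in inference mode batch norm reduces to an affine transform
// using the running statistics passed in as `mean` / `variance`:
//   y = scale * (x - mean) / sqrt(variance + epsilon) + bias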
// BatchNormKernel always writes saved_mean and saved_variance, even in
// test mode, so temporary tensors are created here to stay compatible
// with its signature.
auto saved_mean = phi::EmptyLike<T, Context>(dev_ctx, *mean_out);
auto saved_variance = phi::EmptyLike<T, Context>(dev_ctx, *variance_out);
BatchNormKernel<T, Context>(dev_ctx,
x,
scale,
bias,
mean,
variance,
momentum,
epsilon,
data_layout,
/*is_test=*/true,
/*use_global_stats=*/false,
/*trainable_statistics=*/false,
/*fuse_with_relu=*/false,
y,
mean_out,
variance_out,
&saved_mean,
&saved_variance,
/*reserve_space=*/nullptr);
}
} // namespace phi
PD_REGISTER_KERNEL(batch_norm_infer,
CPU,
ALL_LAYOUT,
phi::BatchNormInferKernel,
float,
double) {}
#ifdef PADDLE_WITH_CUDA
PD_REGISTER_KERNEL(batch_norm_infer,
GPU,
ALL_LAYOUT,
phi::BatchNormInferKernel,
float,
double,
phi::dtype::float16) {
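// Keep the running mean/variance outputs in FP32 even when computing in FP16.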
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
}
}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_infer,
GPU,
ALL_LAYOUT,
phi::BatchNormInferKernel,
float,
phi::dtype::float16) {}
#endif
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace phi { namespace phi {
...@@ -40,4 +41,18 @@ void BatchNormKernel(const Context& dev_ctx, ...@@ -40,4 +41,18 @@ void BatchNormKernel(const Context& dev_ctx,
DenseTensor* saved_variance, DenseTensor* saved_variance,
DenseTensor* reserve_space); DenseTensor* reserve_space);
template <typename T, typename Context>
void BatchNormInferKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out);
} // namespace phi } // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void Conv2dTransposeGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter);
template <typename T, typename Context>
void Conv2dTransposeDoubleGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const DenseTensor& ddx,
const DenseTensor& ddfilter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter,
DenseTensor* ddout);
template <typename T, typename Context>
void Conv3dTransposeGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter);
template <typename T, typename Context>
void DepthwiseConv2dTransposeGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void Conv2dTransposeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out);
template <typename T, typename Context>
void Conv3dTransposeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out);
template <typename T, typename Context>
void DepthwiseConv2dTransposeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConv2dTransposeGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter) {
ConvTransposeGradRawKernel<T, Context>(ctx,
x,
filter,
dout,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
dx,
dfilter);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d_transpose_grad,
CPU,
ALL_LAYOUT,
phi::Conv2dTransposeGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(conv3d_transpose_grad,
CPU,
ALL_LAYOUT,
phi::Conv3dTransposeGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(depthwise_conv2d_transpose_grad,
CPU,
ALL_LAYOUT,
phi::DepthwiseConv2dTransposeGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_transpose_kernel.h"
#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConv2dTransposeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
ConvTransposeRawKernel<T, Context>(ctx,
x,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d_transpose,
CPU,
ALL_LAYOUT,
phi::Conv2dTransposeKernel,
float,
double) {}
PD_REGISTER_KERNEL(conv3d_transpose,
CPU,
ALL_LAYOUT,
phi::Conv3dTransposeKernel,
float,
double) {}
PD_REGISTER_KERNEL(depthwise_conv2d_transpose,
CPU,
ALL_LAYOUT,
phi::DepthwiseConv2dTransposeKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(frobenius_norm_grad,
CPU,
ALL_LAYOUT,
phi::FrobeniusNormGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/frobenius_norm_kernel.h"
#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(
frobenius_norm, CPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void FrobeniusNormGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
const std::vector<int64_t>& axis,
bool keep_dim,
bool reduce_all,
DataType in_dtype,
DataType out_dtype,
DenseTensor* dx);
} // namespace phi
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -15,40 +15,16 @@ ...@@ -15,40 +15,16 @@
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h" namespace phi {
namespace paddle { template <typename T, typename Context>
namespace operators { void FrobeniusNormKernel(const Context& ctx,
const DenseTensor& x,
const std::vector<int64_t>& axis,
bool keep_dim,
bool reduce_all,
DenseTensor* out);
// \partial \| X \|_F = \frac{X}{ \| X \|_F } } // namespace phi
template <typename DeviceContext, typename T, typename Functor>
class FrobeniusNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// default use Eigen broadcast
ReduceGradKernel<DeviceContext, T, Functor, false> kernel;
kernel.Compute(context);
}
};
struct FrobeniusNormFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = ((x->square()).sum(dim)).sqrt();
}
};
struct FrobeniusNormGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = y->broadcast(dim);
dx->device(place) = *dx + dx->constant(1e-12f);
dx->device(place) = (*x / *dx) * (dy->broadcast(dim));
}
};
} // namespace operators
} // namespace paddle
...@@ -17,11 +17,39 @@ ...@@ -17,11 +17,39 @@
namespace phi { namespace phi {
namespace funcs { namespace funcs {
//////// Sum Functor /////// //////// Frobenius Norm Functor ///////
struct SumFunctor { struct FrobeniusNormFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim> template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->sum(dim); y->device(place) = ((x->square()).sum(dim)).sqrt();
}
};
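// Gradient of the Frobenius norm: d||X||_F / dX = X / ||X||_F. The functor
// below adds a small 1e-12 term to the broadcast norm to avoid dividing by
// zero when the norm vanishes.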
struct FrobeniusNormGradFunctor {
template <typename DeviceContext,
typename X,
typename Y,
typename DX,
typename DY,
typename Dim>
void operator()(const DeviceContext& place,
X* x,
Y* y,
DX* dx,
DY* dy,
const Dim& dim,
int size) {
dx->device(place) = y->broadcast(dim);
dx->device(place) = *dx + dx->constant(1e-12f);
dx->device(place) = (*x / *dx) * (dy->broadcast(dim));
}
};
//////// Max Functor ///////
struct MaxFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->maximum(dim);
} }
}; };
...@@ -41,11 +69,11 @@ struct ProdFunctor { ...@@ -41,11 +69,11 @@ struct ProdFunctor {
} }
}; };
//////// Max Functor /////// //////// Sum Functor ///////
struct MaxFunctor { struct SumFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim> template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->maximum(dim); y->device(place) = x->sum(dim);
} }
}; };
......
...@@ -123,5 +123,56 @@ DenseTensor Slice(const Context& dev_ctx, ...@@ -123,5 +123,56 @@ DenseTensor Slice(const Context& dev_ctx,
return ret; return ret;
} }
// Used in the conv_transpose kernels.
template <typename Context, typename T, size_t D>
static void Slice(const Context& ctx,
const DenseTensor* input,
DenseTensor* out,
const std::vector<int64_t>& begin_vec,
const std::vector<int64_t>& end_vec,
const std::vector<int64_t>& axes_vec) {
auto& place = *ctx.eigen_device();
auto in_dims = input->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = in_dims[i];
}
std::vector<int64_t> out_shape_vec = vectorize(in_dims);
for (size_t i = 0; i < axes_vec.size(); ++i) {
offsets[axes_vec[i]] = begin_vec[i];
extents[axes_vec[i]] = end_vec[i] - begin_vec[i];
out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i];
}
DDim out_dims(make_ddim(out_shape_vec));
out->Resize(out_dims);
ctx.template Alloc<T>(out);
auto in_t =
EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
auto out_t = EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, out_dims);
funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_t, in_t, offsets, extents);
out->Resize(out_dims);
}
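// Convenience overload: slice a single axis given scalar begin/end indices.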
template <typename Context, typename T, size_t D>
static void Slice(const Context& ctx,
const DenseTensor* input,
DenseTensor* out,
int64_t begin_idx,
int64_t end_idx,
int64_t axes) {
std::vector<int64_t> begin_vec = {begin_idx};
std::vector<int64_t> end_vec = {end_idx};
std::vector<int64_t> axes_vec = {axes};
Slice<Context, T, D>(ctx, input, out, begin_vec, end_vec, axes_vec);
}
} // namespace funcs } // namespace funcs
} // namespace phi } // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace phi {
template <typename T, typename Context>
void Conv2dTransposeDoubleGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const DenseTensor& ddx,
const DenseTensor& ddfilter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter,
DenseTensor* ddout) {
ConvTransposeGradRawKernel<T, Context>(ctx,
x,
filter,
dout,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
dx,
dfilter);
}
template <typename T, typename Context>
void DepthwiseConv2dTransposeGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter) {
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
DenseTensor filter_ = filter;
if (!dx && !dfilter) {
return;
}
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
auto x_dims = x.dims();
auto filter_dims = filter_.dims();
DDim in_data_dims;
if (data_layout != DataLayout::kNHWC) {
in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
} else {
in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
if (dx) {
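// For a transposed convolution, d(x) is the forward depthwise convolution of
// dout with the same filter, so the forward functor is reused here.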
paddle::operators::math::DepthwiseConvFunctor<Context, T> depthwiseConv;
depthwiseConv(ctx,
dout,
filter_,
strides,
std::vector<int>{
paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
dilations_,
dx,
data_layout);
}
if (dfilter) {
funcs::SetConstant<Context, T> set_zero;
ctx.template Alloc<T>(dfilter);
set_zero(ctx, dfilter, static_cast<T>(0));
paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(
ctx,
dout,
x,
strides,
std::vector<int>{
paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
dilations_,
dfilter,
data_layout);
}
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d_transpose_grad,
GPU,
ALL_LAYOUT,
phi::Conv2dTransposeGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(conv2d_transpose_grad_grad,
GPU,
ALL_LAYOUT,
phi::Conv2dTransposeDoubleGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(conv3d_transpose_grad,
GPU,
ALL_LAYOUT,
phi::Conv3dTransposeGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(depthwise_conv2d_transpose_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConv2dTransposeGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_transpose_kernel.h"
#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConv2dTransposeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
DenseTensor filter_ = filter;
ctx.template Alloc<T>(out);
PADDLE_ENFORCE_EQ(
groups,
filter_.dims()[0],
errors::InvalidArgument(
"groups should be error to the 1st dimension of filter_. But "
"received groups is %d and filter dimension[0] is %d",
groups,
filter_.dims()[0]));
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
for (auto v : dilations_) {
PADDLE_ENFORCE_EQ(
v,
1,
errors::InvalidArgument("dilations should be 1 in depthwise conv. "
"But received dilations is %d",
v));
}
auto x_dims = x.dims();
auto filter_dims = filter_.dims();
DDim in_data_dims;
if (data_layout != DataLayout::kNHWC) {
in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
} else {
in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
ctx.template Alloc<T>(out);
funcs::SetConstant<Context, T> set_zero;
set_zero(ctx, out, static_cast<T>(0));
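// A transposed depthwise convolution equals the input-gradient of the forward
// depthwise convolution, so the InputGrad functor is reused with x acting as
// the output gradient.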
paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(
ctx,
*out,
filter,
x,
strides,
std::vector<int>{paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
dilations_,
out,
data_layout);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d_transpose,
GPU,
ALL_LAYOUT,
phi::Conv2dTransposeKernel,
float,
double) {}
PD_REGISTER_KERNEL(conv3d_transpose,
GPU,
ALL_LAYOUT,
phi::Conv3dTransposeKernel,
float,
double) {}
PD_REGISTER_KERNEL(depthwise_conv2d_transpose,
GPU,
ALL_LAYOUT,
phi::DepthwiseConv2dTransposeKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(frobenius_norm_grad,
GPU,
ALL_LAYOUT,
phi::FrobeniusNormGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/frobenius_norm_kernel.h"
#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(
frobenius_norm, GPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
#include <algorithm>
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/funcs/slice.h"
#include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#endif
namespace phi {
using GPUDNNDataLayout = paddle::platform::DataLayout;
template <typename T, typename Context>
void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter) {
const T* filter_data = filter.data<T>();
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ =
dilations; // cudnn v5 does not support dilations
const GPUDNNDataLayout data_layout =
(data_format != "NHWC" ? GPUDNNDataLayout::kNCHW
: GPUDNNDataLayout::kNHWC);
// if channel_last, transpose to channel_first
DenseTensor x_transpose;
DenseTensor dout_transpose;
std::vector<int> x_vec = vectorize<int>(x.dims());
std::vector<int> out_vec = vectorize<int>(dout.dims());
if (data_layout == GPUDNNDataLayout::kNHWC) {
if (strides.size() == 2U) {
std::vector<int> axis = {0, 3, 1, 2};
for (size_t i = 0; i < axis.size(); ++i) {
x_vec[i] = x.dims()[axis[i]];
out_vec[i] = dout.dims()[axis[i]];
}
x_transpose = Transpose<T, Context>(ctx, x, axis);
dout_transpose = Transpose<T, Context>(ctx, dout, axis);
} else if (strides.size() == 3U) {
std::vector<int> axis = {0, 4, 1, 2, 3};
for (size_t i = 0; i < axis.size(); ++i) {
x_vec[i] = x.dims()[axis[i]];
out_vec[i] = dout.dims()[axis[i]];
}
x_transpose = Transpose<T, Context>(ctx, x, axis);
dout_transpose = Transpose<T, Context>(ctx, dout, axis);
}
} else {
x_transpose = x;
dout_transpose = dout;
}
// update padding and dilation
auto x_dims = x_transpose.dims();
auto filter_dims = filter.dims();
DDim x_data_dims;
x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim);
std::vector<int> x_pad(x_dims.size() * 2, 0);
DenseTensor transformed_dout;
std::vector<int> padding_common(data_dim, 0);
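// cuDNN takes a single (symmetric) padding value per spatial dim; for
// asymmetric paddings, pad dout explicitly by the difference and pass only
// the common (smaller) padding on to cuDNN.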
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_dout_shape_vec(data_dim + 2);
new_dout_shape_vec[0] = dout_transpose.dims()[0];
new_dout_shape_vec[1] = dout_transpose.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]);
padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]);
new_dout_shape_vec[i + 2] =
dout_transpose.dims()[i + 2] + padding_diff[i];
x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i];
x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i];
}
transformed_dout.Resize(make_ddim(new_dout_shape_vec));
ctx.template Alloc<T>(&transformed_dout);
const int rank = x_transpose.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(
ctx, x_pad, dout_transpose, pad_value, &transformed_dout);
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(
ctx, x_pad, dout_transpose, pad_value, &transformed_dout);
} break;
default:
PADDLE_THROW(errors::InvalidArgument(
"Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor."));
}
} else {
transformed_dout = dout_transpose;
if (paddings_.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings_[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings_[2 * i];
}
}
}
const T* x_data = x_transpose.data<T>();
const T* dout_data = transformed_dout.data<T>();
out_vec = vectorize<int>(transformed_dout.dims());
// ------------------- cudnn descriptors ---------------------
GPUDNNDataLayout layout;
if (strides.size() == 2U) {
layout = GPUDNNDataLayout::kNCHW;
} else {
layout = GPUDNNDataLayout::kNCDHW;
}
int iwo_groups = groups;
int c_groups = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
auto dtype = paddle::platform::CudnnDataType<T>::type;
paddle::operators::ConvArgs args1{&transformed_dout,
&filter,
&x_transpose,
strides,
padding_common,
dilations_,
dtype};
paddle::operators::ConvArgs args2{&transformed_dout,
&filter,
&x_transpose,
strides,
padding_common,
dilations_,
dtype};
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t data_algo{};
miopenConvBwdWeightsAlgorithm_t filter_algo{};
#else
cudnnConvolutionFwdAlgo_t data_algo{};
cudnnConvolutionBwdFilterAlgo_t filter_algo{};
#endif
auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
size_t workspace_size = 0;
auto handle = ctx.cudnn_handle();
bool deterministic = FLAGS_cudnn_deterministic;
T* dx_data = nullptr;
T* dfilter_data = nullptr;
if (dx) {
dx_data = ctx.template Alloc<T>(dx);
args1.handle = handle;
args1.idesc.set(transformed_dout, iwo_groups);
args1.wdesc.set(filter, layout_tensor, iwo_groups);
args1.odesc.set(x_transpose, iwo_groups);
args1.cdesc.set(dtype,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1));
data_algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search1::Find<T>(args1, false, deterministic, ctx);
workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
#endif
}
if (dfilter) {
dfilter_data = ctx.template Alloc<T>(dfilter);
args2.handle = handle;
args2.idesc.set(transformed_dout, iwo_groups);
args2.wdesc.set(*dfilter, layout_tensor, iwo_groups);
args2.odesc.set(x_transpose, iwo_groups);
args2.cdesc.set(dtype,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo));
#endif
}
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as the cudnn call.
int x_offset = x.numel() / x.dims()[0] / groups;
int dout_offset =
transformed_dout.numel() / transformed_dout.dims()[0] / groups;
int filter_offset = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle();
if (dx) {
// Because beta is zero, it is unnecessary to reset dx.
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenConvolutionForward(handle,
&alpha,
args1.idesc.desc(),
dout_data + dout_offset * g,
args1.wdesc.desc(),
filter_data + filter_offset * g,
args1.cdesc.desc(),
data_algo,
&beta,
args1.odesc.desc(),
dx_data + x_offset * g,
cudnn_workspace,
workspace_size));
};
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cudnnConvolutionForward(handle,
&alpha,
args1.idesc.desc(),
dout_data + dout_offset * g,
args1.wdesc.desc(),
filter_data + filter_offset * g,
args1.cdesc.desc(),
data_algo,
cudnn_workspace,
workspace_size,
&beta,
args1.odesc.desc(),
dx_data + x_offset * g));
};
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
if (data_layout == GPUDNNDataLayout::kNHWC) {
DenseTensor dx_transpose;
DenseTensor dx_nchw;
dx_nchw.ShareDataWith(*dx);
dx_nchw.Resize(make_ddim(x_vec));
if (strides.size() == 2U) {
std::vector<int> axis = {0, 2, 3, 1};
dx_transpose = Transpose<T, Context>(ctx, dx_nchw, axis);
*dx = dx_transpose;
} else if (strides.size() == 3U) {
std::vector<int> axis = {0, 2, 3, 4, 1};
dx_transpose = Transpose<T, Context>(ctx, dx_nchw, axis);
*dx = dx_transpose;
}
}
}
// ------------------- cudnn conv backward filter ---------------------
if (dfilter) {
// Because beta is zero, it is unnecessary to reset dfilter.
// Gradient with respect to the filter
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args2.odesc.desc(),
x_data + x_offset * g,
args2.idesc.desc(),
dout_data + dout_offset * g,
args2.cdesc.desc(),
filter_algo,
&beta,
args2.wdesc.desc(),
dfilter_data + filter_offset * g,
cudnn_workspace,
workspace_size));
};
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args2.idesc.desc(),
dout_data + dout_offset * g,
args2.odesc.desc(),
x_data + x_offset * g,
args2.cdesc.desc(),
filter_algo,
cudnn_workspace,
workspace_size,
&beta,
args2.wdesc.desc(),
dfilter_data + filter_offset * g));
};
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
}
}
template <typename T, typename Context>
void Conv2dTransposeGradGPUDNNKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings_,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter) {
ConvTransposeGradRawGPUDNNKernel<T, Context>(ctx,
x,
filter,
dout,
strides,
paddings_,
padding_algorithm,
groups,
dilations_,
data_format,
dx,
dfilter);
}
/*
* Inputs: I, filter, dout, ddI, ddfilter
* Outputs: ddout, dfilter, dI
* ddo = conv_bp_data(filter, ddI) + conv_bp_data(ddfilter, I)
* dfilter = conv_bp_filter(dout, ddI)
* dI = conv(dout, ddfilter)
*/
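// Notation above: I is x, ddI is ddx, dI is dx, ddo is ddout; conv_bp_data /
// conv_bp_filter are the backward-data / backward-filter convolutions.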
template <typename T, typename Context>
void Conv2dTransposeDoubleGradGPUDNNKernel(
const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const DenseTensor& ddx,
const DenseTensor& ddfilter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter,
DenseTensor* ddout) {
if (dx) {
ctx.template Alloc<T>(dx);
}
if (dfilter) {
ctx.template Alloc<T>(dfilter);
}
if (ddout) {
ctx.template Alloc<T>(ddout);
funcs::SetConstant<Context, T> set_zero;
set_zero(ctx, ddout, static_cast<T>(0));
}
const T* filter_ = filter.data<T>();
const T* dout_ = dout.data<T>();
const T* ddx_ = nullptr;
const T* ddfilter_ = nullptr;
T* dx_ = nullptr;
T* dfilter_ = nullptr;
T* ddout_ = nullptr;
T* transformed_dx_ = nullptr;
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
bool deterministic = FLAGS_cudnn_deterministic;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform DenseTensors to channel first-----------
DenseTensor transformed_x_channel(x.type());
DenseTensor transformed_dout_channel(dout.type());
DenseTensor transformed_ddx_channel(x.type());
DenseTensor transformed_dx_channel(x.type());
DenseTensor transformed_ddout_channel(dout.type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x_channel);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x_channel);
ResizeToChannelFirst<Context, T>(ctx, &dout, &transformed_dout_channel);
TransToChannelFirst<Context, T>(ctx, &dout, &transformed_dout_channel);
ResizeToChannelFirst<Context, T>(ctx, &ddx, &transformed_ddx_channel);
TransToChannelFirst<Context, T>(ctx, &ddx, &transformed_ddx_channel);
if (dx) {
ResizeToChannelFirst<Context, T>(ctx, dx, &transformed_dx_channel);
ctx.template Alloc<T>(&transformed_dx_channel);
}
if (ddout) {
ResizeToChannelFirst<Context, T>(ctx, ddout, &transformed_ddout_channel);
}
} else {
transformed_x_channel = x;
transformed_dout_channel = dout;
transformed_ddx_channel = ddx;
if (dx) {
transformed_dx_channel = *dx;
}
}
std::vector<int> out_vec = vectorize<int>(transformed_dout_channel.dims());
auto x_dims = transformed_x_channel.dims();
auto filter_dims = filter.dims();
DDim x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim);
DenseTensor transformed_x(x.type());
DenseTensor transformed_ddx(x.type());
DenseTensor transformed_dout(dout.type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(x.dims().size() * 2, 0);
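// As in the single-grad path, asymmetric paddings are handled by padding the
// transformed tensors up front so that only the symmetric padding_common is
// passed to cuDNN.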
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
std::vector<int> new_output_grad_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_x_channel.dims()[0];
new_input_shape_vec[1] = transformed_x_channel.dims()[1];
new_output_grad_shape_vec[0] = transformed_dout_channel.dims()[0];
new_output_grad_shape_vec[1] = transformed_dout_channel.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]);
padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]);
new_input_shape_vec[i + 2] =
transformed_x_channel.dims()[i + 2] + padding_diff[i];
new_output_grad_shape_vec[i + 2] =
transformed_dout_channel.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i];
}
DDim new_input_shape(make_ddim(new_input_shape_vec));
transformed_x.Resize(new_input_shape);
transformed_ddx.Resize(new_input_shape);
transformed_dout.Resize(make_ddim(new_output_grad_shape_vec));
ctx.template Alloc<T>(&transformed_x);
ctx.template Alloc<T>(&transformed_ddx);
ctx.template Alloc<T>(&transformed_dout);
// pad for input
const int rank = x.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(
ctx, input_pad, transformed_x_channel, pad_value, &transformed_x);
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_dout_channel,
pad_value,
&transformed_dout);
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_ddx_channel,
pad_value,
&transformed_ddx);
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(
ctx, input_pad, transformed_x_channel, pad_value, &transformed_x);
funcs::PadFunction<Context, T, 5>(ctx,
input_pad,
transformed_ddx_channel,
pad_value,
&transformed_ddx);
} break;
default:
PADDLE_THROW(errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_x = transformed_x_channel;
transformed_dout = transformed_dout_channel;
transformed_ddx = transformed_ddx_channel;
if (paddings_.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings_[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings_[2 * i];
}
}
}
std::vector<int64_t> starts(data_dim, 0);
std::vector<int64_t> ends(data_dim, 0);
std::vector<int64_t> axes(data_dim, 0);
for (size_t i = 0; i < data_dim; ++i) {
starts[i] = input_pad[2 * i + 4] * (strides[i] + 1);
ends[i] = starts[i] + out_vec[i + 2];
axes[i] = i + 2;
}
std::vector<int> transformed_out_vec = out_vec;
for (size_t i = 0; i < data_dim; ++i) {
transformed_out_vec[i + 2] =
out_vec[i + 2] +
(input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] -
2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1];
}
if (!is_sys_pad) {
transformed_ddout_channel.Resize(make_ddim(transformed_out_vec));
ctx.template Alloc<T>(&transformed_ddout_channel);
} else {
ctx.template Alloc<T>(ddout);
transformed_ddout_channel = *ddout;
transformed_ddout_channel.Resize(make_ddim(transformed_out_vec));
}
const T* x_ = transformed_x.data<T>();
int iwo_group = groups;
int c_group = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_group = 1;
c_group = groups;
groups = 1;
#endif
auto dtype = paddle::platform::CudnnDataType<T>::type;
auto handle = ctx.cudnn_handle();
paddle::operators::ConvArgs args1{&transformed_ddout_channel,
&filter,
&transformed_ddx,
strides,
padding_common,
dilations_,
dtype};
paddle::operators::ConvArgs args2{&transformed_ddout_channel,
&ddfilter,
&transformed_x,
strides,
padding_common,
dilations_,
dtype};
paddle::operators::ConvArgs args3{&transformed_dout,
dfilter,
&transformed_ddx_channel,
strides,
padding_common,
dilations_,
dtype};
paddle::operators::ConvArgs args4{&transformed_dout,
&ddfilter,
&transformed_dx_channel,
strides,
padding_common,
dilations_,
dtype};
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t bwd_algo1 =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdDataAlgorithm_t bwd_algo2 =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvFwdAlgorithm_t data_algo = static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionBwdDataAlgo_t bwd_algo1 =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdDataAlgo_t bwd_algo2 =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionFwdAlgo_t data_algo =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
// ddo = conv(ddI, filter) + conv(I, ddfilter)
size_t workspace_size = 0;
T* transformed_ddout_channel_ = nullptr;
if (ddout) {
ddout_ = ddout->data<T>();
transformed_ddout_channel_ = transformed_ddout_channel.data<T>();
args1.handle = handle;
args1.idesc.set(transformed_ddout_channel, iwo_group);
args1.wdesc.set(filter, layout, iwo_group);
args1.odesc.set(transformed_ddx, iwo_group);
args1.cdesc.set(dtype,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
bwd_algo1 =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx);
workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
#endif
ddfilter_ = ddfilter.data<T>();
args2.handle = handle;
args2.idesc.set(transformed_ddout_channel, iwo_group);
args2.wdesc.set(ddfilter, layout, iwo_group);
args2.odesc.set(transformed_x, iwo_group);
args2.cdesc.set(dtype,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
bwd_algo2 =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx);
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2));
#endif
}
if (dfilter) {
dfilter_ = dfilter->data<T>();
args3.handle = handle;
args3.idesc.set(transformed_dout, iwo_group);
args3.wdesc.set(*dfilter, layout, iwo_group);
args3.odesc.set(transformed_ddx_channel, iwo_group);
args3.cdesc.set(dtype,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search3 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo =
search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
#else
using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search3::Find<T>(args3, false, deterministic, ctx);
workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo));
#endif
}
if (dx) {
transformed_dx_ = transformed_dx_channel.data<T>();
args4.handle = handle;
args4.idesc.set(transformed_dout, iwo_group);
args4.wdesc.set(ddfilter, layout, iwo_group);
args4.odesc.set(transformed_dx_channel, iwo_group);
args4.cdesc.set(dtype,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search4 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo =
search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
#else
using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search4::Find<T>(args4, false, deterministic, ctx);
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif
}
int i_n, i_c, i_d, i_h, i_w;
paddle::operators::GetNCDHW(transformed_x.dims(),
GPUDNNDataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
int o_n, o_c, o_d, o_h, o_w;
paddle::operators::GetNCDHW(transformed_dout.dims(),
GPUDNNDataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
int group_offset_in =
transformed_x.numel() / transformed_x.dims()[0] / groups;
int group_offset_out =
transformed_dout.numel() / transformed_dout.dims()[0] / groups;
int group_offset_filter = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
auto wkspace_handle = ctx.cudnn_workspace_handle();
if (ddout) {
ddx_ = transformed_ddx.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
ddx_ + i * group_offset_in,
args1.wdesc.desc(),
filter_ + i * group_offset_filter,
args1.cdesc.desc(),
bwd_algo1,
&beta,
args1.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out,
workspace_ptr,
workspace_size));
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args1.wdesc.desc(),
filter_ + i * group_offset_filter,
args1.odesc.desc(),
ddx_ + i * group_offset_in,
args1.cdesc.desc(),
bwd_algo1,
workspace_ptr,
workspace_size,
&beta,
args1.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
},
workspace_size);
#endif // PADDLE_WITH_HIP
}
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
      // MIOPEN only supports beta == 0.0f
DenseTensor conv_x_ddfilter(dout.type());
conv_x_ddfilter.Resize(transformed_ddout_channel.dims());
T* conv_x_ddfilter_data = ctx.template Alloc<T>(&conv_x_ddfilter);
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args2.odesc.desc(),
x_ + i * group_offset_in,
args2.wdesc.desc(),
ddfilter_ + i * group_offset_filter,
args2.cdesc.desc(),
bwd_algo2,
&beta,
args2.idesc.desc(),
conv_x_ddfilter_data + i * group_offset_out,
workspace_ptr,
workspace_size));
},
workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenOpTensor(
handle,
miopenTensorOpAdd,
&alpha,
args2.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out,
&alpha,
args2.idesc.desc(),
conv_x_ddfilter_data + i * group_offset_out,
&beta,
args2.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args2.wdesc.desc(),
ddfilter_ + i * group_offset_filter,
args2.odesc.desc(),
x_ + i * group_offset_in,
args2.cdesc.desc(),
bwd_algo2,
workspace_ptr,
workspace_size,
&alpha,
args2.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
},
workspace_size);
#endif // PADDLE_WITH_HIP
}
if ((!is_sys_pad) && (!channel_last)) {
if (strides.size() == 2U) {
funcs::Slice<Context, T, 4>(
ctx, &transformed_ddout_channel, ddout, starts, ends, axes);
} else if (!is_sys_pad && strides.size() == 3U) {
funcs::Slice<Context, T, 5>(
ctx, &transformed_ddout_channel, ddout, starts, ends, axes);
}
} else if ((!is_sys_pad) && (channel_last)) {
if (strides.size() == 2U) {
funcs::Slice<Context, T, 4>(ctx,
&transformed_ddout_channel,
&transformed_ddout_channel,
starts,
ends,
axes);
} else if (!is_sys_pad && strides.size() == 3U) {
funcs::Slice<Context, T, 5>(ctx,
&transformed_ddout_channel,
&transformed_ddout_channel,
starts,
ends,
axes);
}
TransToChannelLast<Context, T>(ctx, &transformed_ddout_channel, ddout);
}
}
T* transformed_dout_channel_ = transformed_dout.data<T>();
if (dfilter) {
ddx_ = transformed_ddx_channel.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args3.odesc.desc(),
ddx_ + i * group_offset_in,
args3.idesc.desc(),
transformed_dout_channel_ + i * group_offset_out,
args3.cdesc.desc(),
filter_algo,
&beta,
args3.wdesc.desc(),
dfilter_ + i * group_offset_filter,
workspace_ptr,
workspace_size));
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args3.idesc.desc(),
transformed_dout_channel_ + i * group_offset_out,
args3.odesc.desc(),
ddx_ + i * group_offset_in,
args3.cdesc.desc(),
filter_algo,
workspace_ptr,
workspace_size,
&beta,
args3.wdesc.desc(),
dfilter_ + i * group_offset_filter));
},
workspace_size);
#endif // PADDLE_WITH_HIP
}
}
if (dx) {
ddfilter_ = ddfilter.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward(
handle,
&alpha,
args4.idesc.desc(),
transformed_dout_channel_ + i * group_offset_out,
args4.wdesc.desc(),
ddfilter_ + i * group_offset_filter,
args4.cdesc.desc(),
data_algo,
&beta,
args4.odesc.desc(),
transformed_dx_ + i * group_offset_in,
workspace_ptr,
workspace_size));
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward(
handle,
&alpha,
args4.idesc.desc(),
transformed_dout_channel_ + i * group_offset_out,
args4.wdesc.desc(),
ddfilter_ + i * group_offset_filter,
args4.cdesc.desc(),
data_algo,
workspace_ptr,
workspace_size,
&beta,
args4.odesc.desc(),
transformed_dx_ + i * group_offset_in));
},
workspace_size);
#endif // PADDLE_WITH_HIP
}
if (channel_last) {
TransToChannelLast<Context, T>(ctx, &transformed_dx_channel, dx);
}
}
}
template <typename T, typename Context>
void Conv3dTransposeGradGPUDNNKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings_,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter) {
ConvTransposeGradRawGPUDNNKernel<T, Context>(ctx,
x,
filter,
dout,
strides,
paddings_,
padding_algorithm,
groups,
dilations_,
data_format,
dx,
dfilter);
}
} // namespace phi
using float16 = phi::dtype::float16;
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support double
PD_REGISTER_KERNEL(conv2d_transpose_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv2dTransposeGradGPUDNNKernel,
float,
float16) {}
PD_REGISTER_KERNEL(conv2d_transpose_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv2dTransposeDoubleGradGPUDNNKernel,
float,
float16) {}
PD_REGISTER_KERNEL(conv3d_transpose_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3dTransposeGradGPUDNNKernel,
float,
float16) {}
#else
PD_REGISTER_KERNEL(conv2d_transpose_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv2dTransposeGradGPUDNNKernel,
float,
double,
float16) {}
PD_REGISTER_KERNEL(conv2d_transpose_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv2dTransposeDoubleGradGPUDNNKernel,
float,
double,
float16) {}
PD_REGISTER_KERNEL(conv3d_transpose_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3dTransposeGradGPUDNNKernel,
float,
double,
float16) {}
#endif
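A quick way to see these registrations in action: differentiating a transposed convolution in dygraph routes the backward pass through `conv2d_transpose_grad`, and differentiating that backward again is what reaches `conv2d_transpose_grad_grad`. The sketch below is illustrative only (it assumes the public `paddle.nn.functional.conv2d_transpose` API and an eager-mode build); it is not part of this diff.

# Hypothetical usage sketch (not part of this diff): differentiate a 2-D
# transposed convolution so that the registered grad kernel runs.
import paddle
import paddle.nn.functional as F

x = paddle.randn([2, 4, 8, 8])   # NCHW input
w = paddle.randn([4, 8, 3, 3])   # (in_channels, out_channels, kH, kW)
x.stop_gradient = False
w.stop_gradient = False

y = F.conv2d_transpose(x, w, stride=2, padding=1)
y.sum().backward()               # triggers conv2d_transpose_grad
print(x.grad.shape, w.grad.shape)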
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/conv_transpose_kernel.h"
#include <algorithm>
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/funcs/slice.h"
#include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#endif
namespace phi {
using GPUDNNDataLayout = paddle::platform::DataLayout;
template <typename T, typename Context>
void ConvTransposeRawGPUDNNKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ =
dilations; // cudnn v5 does not support dilations
const T* filter_data = filter.data<T>();
const GPUDNNDataLayout data_layout =
(data_format != "NHWC" ? GPUDNNDataLayout::kNCHW
: GPUDNNDataLayout::kNHWC);
std::vector<int> x_vec = vectorize<int>(x.dims());
std::vector<int> out_vec = vectorize<int>(out->dims());
// if channel_last, transpose to channel_first
DenseTensor x_transpose;
if (data_layout == GPUDNNDataLayout::kNHWC) {
if (strides.size() == 2U) {
std::vector<int> axis = {0, 3, 1, 2};
for (size_t i = 0; i < axis.size(); ++i) {
x_vec[i] = x.dims()[axis[i]];
out_vec[i] = out->dims()[axis[i]];
}
x_transpose = Transpose<T, Context>(ctx, x, axis);
} else if (strides.size() == 3U) {
std::vector<int> axis = {0, 4, 1, 2, 3};
for (size_t i = 0; i < axis.size(); ++i) {
x_vec[i] = x.dims()[axis[i]];
out_vec[i] = out->dims()[axis[i]];
}
x_transpose = Transpose<T, Context>(ctx, x, axis);
}
} else {
x_transpose = x;
}
// update padding and dilation
auto x_dims = x_transpose.dims();
auto filter_dims = filter.dims();
DDim x_data_dims;
x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim);
std::vector<int> x_pad(x_dims.size() * 2, 0);
DenseTensor transformed_x;
std::vector<int> padding_common(data_dim, 0);
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_x_shape_vec(data_dim + 2);
new_x_shape_vec[0] = x_dims[0];
new_x_shape_vec[1] = x_dims[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]);
padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]);
new_x_shape_vec[i + 2] = x_dims[i + 2] + padding_diff[i];
x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i];
x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i];
}
DDim new_x_shape(make_ddim(new_x_shape_vec));
transformed_x.Resize(new_x_shape);
ctx.template Alloc<T>(&transformed_x);
const int rank = x_dims.size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(
ctx, x_pad, x_transpose, pad_value, &transformed_x);
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(
ctx, x_pad, x_transpose, pad_value, &transformed_x);
} break;
default:
PADDLE_THROW(errors::InvalidArgument(
"Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor."));
}
} else {
transformed_x = x_transpose;
if (paddings_.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings_[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings_[2 * i];
}
}
}
std::vector<int64_t> starts(data_dim, 0);
std::vector<int64_t> ends(data_dim, 0);
std::vector<int64_t> axes(data_dim, 0);
for (size_t i = 0; i < data_dim; ++i) {
starts[i] = x_pad[2 * i + 4] * (strides[i] + 1);
ends[i] = starts[i] + out_vec[i + 2];
axes[i] = i + 2;
}
const T* x_data = transformed_x.data<T>();
x_vec = vectorize<int>(transformed_x.dims());
std::vector<int> transformed_out_vec = out_vec;
for (size_t i = 0; i < data_dim; ++i) {
transformed_out_vec[i + 2] =
out_vec[i + 2] + (x_pad[2 * i + 4] + x_pad[2 * i + 5]) * strides[i] -
2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1];
}
DenseTensor transformed_out;
if (!is_sys_pad) {
transformed_out.Resize(make_ddim(transformed_out_vec));
ctx.template Alloc<T>(&transformed_out);
} else {
ctx.template Alloc<T>(out);
transformed_out.ShareDataWith(*out);
transformed_out.Resize(make_ddim(transformed_out_vec));
}
T* transformed_out_data = transformed_out.data<T>();
GPUDNNDataLayout layout;
int iwo_groups = groups;
int c_groups = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
if (strides.size() == 2U) {
layout = GPUDNNDataLayout::kNCHW;
} else {
layout = GPUDNNDataLayout::kNCDHW;
}
size_t workspace_size = 0;
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t algo{};
#else
cudnnConvolutionBwdDataAlgo_t algo{};
#endif
// ------------------- cudnn conv algorithm ---------------------
auto handle = ctx.cudnn_handle();
auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
bool deterministic = FLAGS_cudnn_deterministic;
auto dtype = paddle::platform::CudnnDataType<T>::type;
// ------------------- cudnn descriptors ---------------------
paddle::operators::ConvArgs args{&transformed_out,
&filter,
&transformed_x,
strides,
padding_common,
dilations_,
dtype};
args.handle = handle;
args.idesc.set(transformed_out, iwo_groups);
args.wdesc.set(filter, layout_tensor, iwo_groups);
args.odesc.set(transformed_x, iwo_groups);
args.cdesc.set(dtype,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
algo = search::Find<T>(args, false, deterministic, workspace_size, ctx);
#else
using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
algo = search::Find<T>(args, false, deterministic, ctx);
workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, algo));
#endif
// ------------------- cudnn conv transpose forward ---------------------
int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups;
int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups;
int filter_offset = filter.numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle();
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args.odesc.desc(),
x_data + x_offset * g,
args.wdesc.desc(),
filter_data + filter_offset * g,
args.cdesc.desc(),
algo,
&beta,
args.idesc.desc(),
transformed_out_data + out_offset * g,
cudnn_workspace,
workspace_size));
};
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args.wdesc.desc(),
filter_data + filter_offset * g,
args.odesc.desc(),
x_data + x_offset * g,
args.cdesc.desc(),
algo,
cudnn_workspace,
workspace_size,
&beta,
args.idesc.desc(),
transformed_out_data + out_offset * g));
};
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
if (!is_sys_pad && strides.size() == 2U) {
funcs::Slice<Context, T, 4>(ctx, &transformed_out, out, starts, ends, axes);
} else if (!is_sys_pad && strides.size() == 3U) {
funcs::Slice<Context, T, 5>(ctx, &transformed_out, out, starts, ends, axes);
}
if (data_layout == GPUDNNDataLayout::kNHWC) {
DenseTensor out_transpose;
DenseTensor out_nchw;
out_nchw.ShareDataWith(*out);
out_nchw.Resize(make_ddim(out_vec));
if (strides.size() == 2U) {
out_transpose = Transpose<T, Context>(ctx, out_nchw, {0, 2, 3, 1});
} else if (strides.size() == 3U) {
out_transpose = Transpose<T, Context>(ctx, out_nchw, {0, 2, 3, 4, 1});
}
*out = out_transpose;
}
}
template <typename T, typename Context>
void Conv2dTransposeGPUDNNKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
ConvTransposeRawGPUDNNKernel<T, Context>(ctx,
x,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
template <typename T, typename Context>
void Conv3dTransposeGPUDNNKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
ConvTransposeRawGPUDNNKernel<T, Context>(ctx,
x,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
} // namespace phi
using float16 = phi::dtype::float16;
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support double
PD_REGISTER_KERNEL(conv2d_transpose,
GPUDNN,
ALL_LAYOUT,
phi::Conv2dTransposeGPUDNNKernel,
float,
float16) {}
PD_REGISTER_KERNEL(conv3d_transpose,
GPUDNN,
ALL_LAYOUT,
phi::Conv3dTransposeGPUDNNKernel,
float,
float16) {}
#else
PD_REGISTER_KERNEL(conv2d_transpose,
GPUDNN,
ALL_LAYOUT,
phi::Conv2dTransposeGPUDNNKernel,
float,
double,
float16) {}
PD_REGISTER_KERNEL(conv3d_transpose,
GPUDNN,
ALL_LAYOUT,
phi::Conv3dTransposeGPUDNNKernel,
float,
double,
float16) {}
#endif
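For reference, the forward kernel registered here backs the usual Python entry point. A hedged shape check (not part of this diff) against the standard transposed-convolution output formula, H_out = (H_in - 1) * stride - 2 * pad + dilation * (kH - 1) + 1 + output_padding:

# Hypothetical shape check (not part of this diff).
import paddle
import paddle.nn.functional as F

x = paddle.randn([1, 3, 10, 10])   # NCHW
w = paddle.randn([3, 6, 4, 4])     # (in_channels, out_channels, kH, kW)
y = F.conv2d_transpose(x, w, stride=2, padding=1, dilation=1)

h_out = (10 - 1) * 2 - 2 * 1 + 1 * (4 - 1) + 1   # = 20
assert list(y.shape) == [1, 6, h_out, h_out]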
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/slice.h"
namespace phi {
template <typename T, typename Context>
void ConvTransposeGradRawKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter) {
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
  // For filter, we do not use a const pointer because we will reshape it
  // below, but its values must not be modified.
DenseTensor filter_ = filter;
if ((!dx) && (!dfilter)) {
return;
}
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
auto x_dims = x.dims();
auto filter_dims = filter_.dims();
auto dout_dims = dout.dims();
const int batch_size = static_cast<int>(x.dims()[0]);
DDim in_data_dims;
if (data_layout != DataLayout::kNHWC) {
in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
} else {
in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
// x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
// x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
std::vector<int64_t> x_shape_vec = vectorize(x.dims());
// filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec = vectorize(filter_.dims());
// use col_shape in the im2col and col2im (or vol2col and col2vol)
// calculation
  // col_shape_vec: {o_c, k_h, k_w, h, w} or {o_c, k_d, k_h, k_w, d, h, w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
if (data_layout != DataLayout::kNHWC) {
col_shape_vec[0] = dout_dims[1];
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 2];
}
} else {
col_shape_vec[0] = dout_dims[dout_dims.size() - 1];
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1];
}
}
DDim col_shape(make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w)
DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
// output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
// output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
DDim output_shape = slice_ddim(dout.dims(), 1, dout.dims().size());
// x matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
// x matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
DDim x_matrix_shape;
if (data_layout != DataLayout::kNHWC) {
x_matrix_shape = {x_dims[1], col_matrix_shape[1]};
} else {
x_matrix_shape = {col_matrix_shape[1], x_dims[x_dims.size() - 1]};
}
// filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
DDim filter_matrix_shape;
if (data_layout != DataLayout::kNHWC) {
filter_matrix_shape = {x_dims[1], col_matrix_shape[0] / groups};
} else {
filter_matrix_shape = {x_dims[x_dims.size() - 1],
col_matrix_shape[0] / groups};
}
filter_.Resize(filter_matrix_shape);
int in_step = (data_layout != DataLayout::kNHWC
? static_cast<int>(x_dims[1]) / groups
: static_cast<int>(x_dims[x_dims.size() - 1]) / groups);
int col_step = static_cast<int>(col_matrix_shape[0]) / groups;
// convolution transpose grad on x:
// im2col + gemm (similar to conv-forward)
// x need to compute gradient
auto blas = funcs::GetBlas<Context, T>(ctx);
if (dx || dfilter) {
DenseTensor col;
col.Resize(col_shape);
ctx.template Alloc<T>(&col);
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DenseTensor col_matrix;
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
DenseTensor dfilter_;
funcs::SetConstant<Context, T> set_zero;
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
funcs::ConcatFunctor<Context, T> concat_functor;
if (dx) {
ctx.template Alloc<T>(dx);
set_zero(ctx, dx, static_cast<T>(0));
}
if (dfilter) { // dfilter_ size (i_c, o_c/g, k_h, k_w)
ctx.template Alloc<T>(dfilter);
set_zero(ctx, dfilter, static_cast<T>(0));
dfilter_ = *dfilter;
dfilter_.Resize(filter_matrix_shape);
}
size_t D = x.dims().size();
for (int i = 0; i < batch_size; i++) {
// batch with size (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for
// channel_first
// batch with size (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for
// channel_last
DenseTensor dout_batch = dout.Slice(i, i + 1).Resize(output_shape);
if (data_dim == 2U) {
// im2col: dy -> col matrix
// from (o_c, o_h, o_w) to (o_c * k_h * k_w, i_h * i_w) for
// channel_first
// from (o_h, o_w, o_c) to (o_c * k_h * k_w, i_h * i_w) for
// channel_last
im2col(ctx,
dout_batch,
dilations_,
strides,
std::vector<int>{
paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
&col,
data_layout);
} else if (data_dim == 3U) {
// vol2col: dy -> col_matrix
// from (o_c, o_d, o_h, o_w) to (o_c * k_d * k_h * k_w, i_d * i_h *
// i_w) for channel_first
// from (o_d, o_h, o_w, o_c) to (i_d * i_h * i_w, o_c * k_d * k_h *
// k_w) for channel_last
vol2col(
ctx, dout_batch, dilations_, strides, paddings_, &col, data_layout);
}
if (dx) {
// batch with size (i_c, i_h, i_w) or (i_h, i_w, i_c)
DenseTensor dx_batch = dx->Slice(i, i + 1).Resize(x_matrix_shape);
// gemm: dx = filter * dy
// (i_c, o_c * k_h * k_w) * (o_c * k_h * k_w, i_h * i_w) -> (i_c, i_h
// * i_w)
// or
// (i_c, o_c * k_d * k_h * k_w) * (o_c * k_d * k_h * k_w, i_d * i_h *
// i_w) -> (i_c,
// i_d, i_h, i_w)
// gemm: dx = dy^T * filter^T for channel_last
std::vector<DenseTensor> dx_batch_vec;
for (int g = 0; g < groups; g++) {
// dx_slice: (i_c/g, i_h * i_w) or (i_c/g, i_d * i_h * i_w)
// for channel_first
// dx_slice: (i_h * i_w, i_c/g) or (i_d * i_h * i_w, i_c/g)
// for channel_last
// filter_slice: (i_c/g, o_c/g * k_h * k_w)
DenseTensor filter_slice =
filter_.Slice(g * in_step, (g + 1) * in_step);
// col_matrix_slice: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d *
// k_h * k_w, d * h * w)
DenseTensor col_matrix_slice =
col_matrix.Slice(g * col_step, (g + 1) * col_step);
if (data_layout != DataLayout::kNHWC) {
DenseTensor dx_slice =
dx_batch.Slice(g * in_step, (g + 1) * in_step);
blas.MatMul(filter_slice,
false,
col_matrix_slice,
false,
static_cast<T>(1.0),
&dx_slice,
static_cast<T>(0.0));
} else {
DenseTensor dx_slice;
funcs::Slice<Context, T, 2>(
ctx, &dx_batch, &dx_slice, g * in_step, (g + 1) * in_step, 1);
blas.MatMul(col_matrix_slice,
true,
filter_slice,
true,
static_cast<T>(1.0),
&dx_slice,
static_cast<T>(0.0));
DDim dx_slice_shape;
if (data_dim == 2U) {
dx_slice_shape = {x_dims[1], x_dims[2], in_step};
} else {
dx_slice_shape = {x_dims[1], x_dims[2], x_dims[3], in_step};
}
dx_slice = dx_slice.Resize(dx_slice_shape);
dx_batch_vec.push_back(dx_slice);
}
}
if (data_layout == DataLayout::kNHWC) {
concat_functor(ctx, dx_batch_vec, static_cast<int>(D - 2), &dx_batch);
}
}
if (dfilter) {
        // x batch: (i_c, i_h * i_w) for channel_first or (i_h * i_w, i_c) for channel_last
DenseTensor in_batch = x.Slice(i, i + 1).Resize(x_matrix_shape);
// gemm: d_filter = x * dy^T
// (i_c, i_h * i_w) * (i_h * i_w, o_c * k_h * k_w) -> (i_c, o_c * k_h
// * k_w)
// or
// (i_c, i_d * i_h * i_w) * (i_d * i_h * i_w, o_c * k_d * k_h * k_w)
// -> (i_c, o_c * k_d *
// k_h * k_w)
// gemm: d_filter = x^T * dy^T for channel_last
for (int g = 0; g < groups; g++) {
DenseTensor dfilter_slice =
dfilter_.Slice(g * in_step, (g + 1) * in_step);
DenseTensor col_matrix_slice =
col_matrix.Slice(g * col_step, (g + 1) * col_step);
if (data_layout != DataLayout::kNHWC) {
DenseTensor in_batch_slice =
in_batch.Slice(g * in_step, (g + 1) * in_step);
blas.MatMul(in_batch_slice,
false,
col_matrix_slice,
true,
static_cast<T>(1.0),
&dfilter_slice,
static_cast<T>(1.0));
} else {
DenseTensor in_batch_slice;
funcs::Slice<Context, T, 2>(ctx,
&in_batch,
&in_batch_slice,
g * in_step,
(g + 1) * in_step,
1);
blas.MatMul(in_batch_slice,
true,
col_matrix_slice,
true,
static_cast<T>(1.0),
&dfilter_slice,
static_cast<T>(1.0));
}
}
}
}
}
}
template <typename T, typename Context>
void Conv2dTransposeGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter) {
ConvTransposeGradRawKernel<T, Context>(ctx,
x,
filter,
dout,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
dx,
dfilter);
}
template <typename T, typename Context>
void Conv3dTransposeGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& dout,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* dx,
DenseTensor* dfilter) {
ConvTransposeGradRawKernel<T, Context>(ctx,
x,
filter,
dout,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
dx,
dfilter);
}
} // namespace phi
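The comments above describe the backward computation as im2col/vol2col on `dout` followed by a GEMM against the reshaped filter. A shape-only NumPy sketch of the channel_first case (illustration only, not the kernel code) makes that bookkeeping concrete:

# Shape-only NumPy sketch of the channel_first gemm in ConvTransposeGradRawKernel:
#   dx = filter * col, with
#   filter: (i_c, o_c * k_h * k_w), col: (o_c * k_h * k_w, i_h * i_w)
import numpy as np

i_c, o_c, k_h, k_w, i_h, i_w = 4, 8, 3, 3, 5, 5
filter_mat = np.random.randn(i_c, o_c * k_h * k_w)
col = np.random.randn(o_c * k_h * k_w, i_h * i_w)   # im2col of dout

dx = filter_mat @ col                                # (i_c, i_h * i_w)
assert dx.shape == (i_c, i_h * i_w)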
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/conv_transpose_kernel.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/slice.h"
namespace phi {
template <typename T, typename Context>
void ConvTransposeRawKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
// The filter will be reshaped, so it should not be constant
DenseTensor filter_ = filter;
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
auto x_dims = x.dims();
auto filter_dims = filter_.dims();
auto out_dims = out->dims();
const int batch_size = static_cast<int>(x.dims()[0]);
DDim in_data_dims;
if (data_layout != DataLayout::kNHWC) {
in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
} else {
in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
// x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
// x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
std::vector<int64_t> x_shape_vec = vectorize(x.dims());
// filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec = vectorize(filter_.dims());
// use col_shape in the im2col and col2im (or vol2col and col2vol)
// calculation
// col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
if (data_layout != DataLayout::kNHWC) {
col_shape_vec[0] = out_dims[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 2];
}
} else {
col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1];
}
}
DDim col_shape(make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w)
DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
DenseTensor col;
col.Resize(col_shape);
ctx.template Alloc<T>(&col);
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DenseTensor col_matrix;
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
// out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
// out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
DDim out_shape = slice_ddim(out->dims(), 1, out->dims().size());
// x matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
// x matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
DDim x_matrix_shape;
if (data_layout != DataLayout::kNHWC) {
x_matrix_shape = {x_dims[1], col_matrix_shape[1]};
} else {
x_matrix_shape = {col_matrix_shape[1], x_dims[x_dims.size() - 1]};
}
// filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
DDim filter_matrix_shape;
if (data_layout != DataLayout::kNHWC) {
filter_matrix_shape = {x_dims[1], col_matrix_shape[0]};
} else {
filter_matrix_shape = {x_dims[x_dims.size() - 1], col_matrix_shape[0]};
}
filter_.Resize(filter_matrix_shape);
ctx.template Alloc<T>(out);
funcs::SetConstant<Context, T> set_zero;
auto blas = funcs::GetBlas<Context, T>(ctx);
set_zero(ctx, out, static_cast<T>(0));
int in_step = (data_layout != DataLayout::kNHWC
? static_cast<int>(x_dims[1]) / groups
: static_cast<int>(x_dims[x_dims.size() - 1]) / groups);
int out_step =
(data_layout != DataLayout::kNHWC
? static_cast<int>(out_dims[1]) / groups
: static_cast<int>(out_dims[out_dims.size() - 1]) / groups);
paddle::operators::math::
Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
col2im;
paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
funcs::ConcatFunctor<Context, T> concat_functor;
// convolution transpose: gemm + col2im or col2vol (similar to conv-backward
// on x)
size_t D = x.dims().size();
for (int i = 0; i < batch_size; i++) {
// batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first
// batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last
DenseTensor x_batch = x.Slice(i, i + 1).Resize(x_matrix_shape);
// out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
// out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
DenseTensor out_batch = out->Slice(i, i + 1).Resize(out_shape);
std::vector<DenseTensor> out_batch_vec;
for (int g = 0; g < groups; g++) {
int64_t start = g * in_step;
int64_t end = (g + 1) * in_step;
int axes = (data_layout != DataLayout::kNHWC ? 0 : 1);
DenseTensor filter_slice = filter_.Slice(g * in_step, (g + 1) * in_step);
DenseTensor in_slice, out_slice;
// col_matrix = filter_slice * x_slice
// of shape (o_c/g * k_h * k_w, h * w)
// or (o_c/g * k_d * k_h * k_w, d * h * w)
if (data_layout != DataLayout::kNHWC) {
in_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(filter_slice,
true,
in_slice,
false,
static_cast<T>(1.0),
&col_matrix,
static_cast<T>(0.0));
} else {
funcs::Slice<Context, T, 2>(ctx, &x_batch, &in_slice, start, end, axes);
start = g * out_step;
end = (g + 1) * out_step;
axes = D - 2;
if (D == 4U) {
funcs::Slice<Context, T, 3>(
ctx, &out_batch, &out_slice, start, end, axes);
} else if (D == 5U) {
funcs::Slice<Context, T, 4>(
ctx, &out_batch, &out_slice, start, end, axes);
}
blas.MatMul(filter_slice,
true,
in_slice,
true,
static_cast<T>(1.0),
&col_matrix,
static_cast<T>(0.0));
}
if (data_dim == 2U) {
// col2im: col_matrix -> dy from (o_c/g * k_h * k_w, h * w) to (o_c/g,
// o_h, o_w) or (o_h, o_w, o_c/g)
col2im(ctx,
col,
dilations_,
strides,
std::vector<int>{
paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
&out_slice,
data_layout);
} else if (data_dim == 3U) {
// col2vol: col_matrix -> dy from (o_c/g * k_d * k_h * k_w, d * h * w)
// to (o_c/g, o_d, o_h, o_w) or (o_d, o_h, o_w, o_c/g)
col2vol(
ctx, col, dilations_, strides, paddings_, &out_slice, data_layout);
}
if (data_layout == DataLayout::kNHWC) {
out_batch_vec.push_back(out_slice);
}
}
if (data_layout == DataLayout::kNHWC) {
concat_functor(ctx, out_batch_vec, static_cast<int>(D - 2), &out_batch);
}
}
}
template <typename T, typename Context>
void Conv2dTransposeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
ConvTransposeRawKernel<T, Context>(ctx,
x,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
template <typename T, typename Context>
void Conv3dTransposeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
ConvTransposeRawKernel<T, Context>(ctx,
x,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
} // namespace phi
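The forward path is the mirror image: a GEMM producing the column matrix, then col2im/col2vol scattering it into the output. Again, a shape-only NumPy sketch of the channel_first case (illustration only, not the kernel code):

# Shape-only NumPy sketch of the channel_first gemm in ConvTransposeRawKernel:
#   col_matrix = filter_slice^T * x_slice, with
#   filter_slice: (i_c/g, o_c/g * k_h * k_w), x_slice: (i_c/g, h * w)
import numpy as np

i_c_g, o_c_g, k_h, k_w, h, w = 4, 8, 3, 3, 5, 5
filter_slice = np.random.randn(i_c_g, o_c_g * k_h * k_w)
x_slice = np.random.randn(i_c_g, h * w)

col_matrix = filter_slice.T @ x_slice                # (o_c/g * k_h * k_w, h * w)
assert col_matrix.shape == (o_c_g * k_h * k_w, h * w)
# col2im / col2vol then scatters col_matrix into the (o_c/g, o_h, o_w) output.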
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
#include "paddle/phi/kernels/impl/reduce_grad.h"
namespace phi {
template <typename T, typename Context>
void FrobeniusNormGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
const std::vector<int64_t>& axis,
bool keep_dim,
bool reduce_all,
DataType in_dtype,
DataType out_dtype,
DenseTensor* dx) {
ReduceGradKernel<Context, T, funcs::FrobeniusNormGradFunctor>(
ctx, x, dout, out, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/frobenius_norm_kernel.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
namespace phi {
template <typename T, typename Context>
void FrobeniusNormKernel(const Context& ctx,
const DenseTensor& x,
const std::vector<int64_t>& axis,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
Reduce<Context, T, funcs::FrobeniusNormFunctor>(
ctx, x, reduce_all, axis, keep_dim, x.dtype(), out);
}
} // namespace phi
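Both Frobenius-norm kernels delegate to the generic reduce machinery; mathematically the forward computes sqrt(sum(x^2)) over the reduced axes, and the gradient of that norm with respect to x is x / out, scaled by dout. A small NumPy sanity check of that math (illustration only, not the functor implementation):

# NumPy sketch of the Frobenius norm and its analytic gradient, checked
# against a finite-difference estimate on one element.
import numpy as np

x = np.random.randn(3, 4)
out = np.sqrt((x ** 2).sum())            # reduce_all case
dout = 1.0                               # upstream gradient of a scalar loss
dx = dout * x / out

eps = 1e-6
x_eps = x.copy()
x_eps[0, 0] += eps
num = (np.sqrt((x_eps ** 2).sum()) - out) / eps
assert abs(num - dx[0, 0]) < 1e-4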
@@ -17,21 +17,35 @@
namespace phi {
KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
  bool is_test = paddle::any_cast<bool>(ctx.Attr("is_test"));
  bool use_global_stats = paddle::any_cast<bool>(ctx.Attr("use_global_stats"));
  bool trainable_statistics =
      paddle::any_cast<bool>(ctx.Attr("trainable_statistics"));
  bool fuse_with_relu = paddle::any_cast<bool>(ctx.Attr("fuse_with_relu"));
  // The dispensable `MomentumTensor` input is useless now
  if (is_test && !use_global_stats && !trainable_statistics &&
      !fuse_with_relu) {
    return KernelSignature("batch_norm_infer",
                           {"X", "Scale", "Bias", "Mean", "Variance"},
                           {"momentum", "epsilon", "data_layout"},
                           {"Y", "MeanOut", "VarianceOut"});
  } else {
    return KernelSignature("batch_norm",
                           {"X", "Scale", "Bias", "Mean", "Variance"},
                           {"momentum",
                            "epsilon",
                            "data_layout",
                            "is_test",
                            "use_global_stats",
                            "trainable_statistics",
                            "fuse_with_relu"},
                           {"Y",
                            "MeanOut",
                            "VarianceOut",
                            "SavedMean",
                            "SavedVariance",
                            "ReserveSpace"});
  }
}
KernelSignature BatchNormGradOpArgumentMapping(
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature Conv2dTransposeOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d_transpose",
{"Input", "Filter"},
{"strides",
"paddings",
"output_padding",
"output_size",
"padding_algorithm",
"groups",
"dilations",
"data_format"},
{"Output"});
}
KernelSignature Conv2dTransposeGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d_transpose_grad",
{"Input", "Filter", GradVarName("Output")},
{"strides",
"paddings",
"output_padding",
"output_size",
"padding_algorithm",
"groups",
"dilations",
"data_format"},
{GradVarName("Input"), GradVarName("Filter")});
}
KernelSignature Conv2dTransposeDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d_transpose_grad_grad",
{"Input", "Filter", "DOutput", "DDInput", "DDFilter"},
{"strides",
"paddings",
"output_padding",
"output_size",
"padding_algorithm",
"groups",
"dilations",
"data_format"},
{"DInput", "DFilter", "DDOutput"});
}
KernelSignature Conv3dTransposeOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv3d_transpose",
{"Input", "Filter"},
{"strides",
"paddings",
"output_padding",
"output_size",
"padding_algorithm",
"groups",
"dilations",
"data_format"},
{"Output"});
}
KernelSignature Conv3dTransposeGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv3d_transpose_grad",
{"Input", "Filter", GradVarName("Output")},
{"strides",
"paddings",
"output_padding",
"output_size",
"padding_algorithm",
"groups",
"dilations",
"data_format"},
{GradVarName("Input"), GradVarName("Filter")});
}
KernelSignature DepthwiseConv2dTransposeOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("depthwise_conv2d_transpose",
{"Input", "Filter"},
{"strides",
"paddings",
"output_padding",
"output_size",
"padding_algorithm",
"groups",
"dilations",
"data_format"},
{"Output"});
}
KernelSignature DepthwiseConv2dTransposeGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("depthwise_conv2d_transpose_grad",
{"Input", "Filter", GradVarName("Output")},
{"strides",
"paddings",
"output_padding",
"output_size",
"padding_algorithm",
"groups",
"dilations",
"data_format"},
{GradVarName("Input"), GradVarName("Filter")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose,
phi::Conv2dTransposeOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad,
phi::Conv2dTransposeGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad_grad,
phi::Conv2dTransposeDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose,
phi::Conv3dTransposeOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose_grad,
phi::Conv3dTransposeGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose,
phi::DepthwiseConv2dTransposeOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose_grad,
phi::DepthwiseConv2dTransposeGradOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature FrobeniusNormOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"frobenius_norm", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
}
KernelSignature FrobeniusNormGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"frobenius_norm_grad",
{"X", "Out", GradVarName("Out")},
{"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(frobenius_norm, phi::FrobeniusNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(frobenius_norm_grad,
phi::FrobeniusNormGradOpArgumentMapping);
@@ -14,6 +14,7 @@
import numpy as np
import os
from datetime import timedelta
from ..fluid.layer_helper import LayerHelper
from ..fluid.framework import Variable
from ..fluid.framework import OpProtoHolder
@@ -73,6 +74,7 @@ class ReduceOp:
    MAX = 1
    MIN = 2
    PROD = 3
    AVG = 4


class Group():
@@ -80,11 +82,13 @@ class Group():
    The abstract representation of group.
    """

    def __init__(self, rank, rank_num, id=0, ranks=[], pg=None, name=None):
        self.rank = rank
        self.nranks = rank_num
        self.id = id
        self.ranks = ranks
        self.pg = pg
        self.name = name

    def is_member(self):
        if self.rank < 0:
@@ -99,11 +103,16 @@ class Group():
        else:
            return -1

    @property
    def process_group(self):
        return self.pg

    def __repr__(self):
        debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format(
            self.rank, self.nranks, self.id)
        debug_str += ", ".join(map(str, self.ranks))
        debug_str += "; name: "
        debug_str += self.name if self.name else "None"
        return debug_str
@@ -121,6 +130,17 @@ def _get_global_env():
# Dict[int, Group]
_group_map = {}

# group map by name : the map of all groups from their names
# Dict[name, Group]
_group_map_by_name = {}

# Name of the default group for init_parallel_env
_default_group_name = "_default_pg"

_valid_backend_list = ['nccl', 'gloo', 'hccl']
_default_store = None  # the default tcp store
_default_backend = None


def _get_group_map():
    global _group_map
@@ -135,10 +155,29 @@ def _get_global_group():
    return _get_group_map()[0]


def _get_group_map_by_name():
    global _group_map_by_name
    assert _default_group_name in _group_map_by_name, (
        "Call paddle.distributed.init_parallel_env first "
        "to initialize the distributed environment.")
    return _group_map_by_name


def _get_default_group():
    assert _default_group_name in _group_map_by_name, (
        "Call paddle.distributed.init_parallel_env first "
        "to initialize the distributed environment.")
    return _get_group_map_by_name()[_default_group_name]


def _new_ring_id():
    return len(_get_group_map()) + max(_get_global_env().nrings, 9)


def _new_group_name_id():
    return len(_get_group_map_by_name()) + max(_get_global_env().nrings, 9)


def get_group(id=0):
    """
@@ -163,6 +202,194 @@ def get_group(id=0):
    return gm[id] if id in gm else None
def _new_process_group_impl(backend, store, rank, world_size, group_name,
pg_options):
if backend == "gloo":
gloo_store = core.GlooStore(store)
pg = None
if backend == "gloo":
pg = core.ProcessGroupGloo(gloo_store, rank, world_size)
elif backend == "nccl":
pg = core.ProcessGroupNCCL(store, rank, world_size)
elif backend == "hccl":
pg = core.ProcessGroupHCCL(store, rank, world_size)
return pg
def _init_parallel_env(rank=None,
world_size=None,
backend="nccl",
timeout=timedelta(0),
pg_options=None):
"""
Initializes the default distributed environment.
Args:
rank (int, optional): the rank of the current process or device from 0 to world_size (exclusive).
If you launch your training with paddle.distributed.run or
paddle.distributed.launch module, None can be given. Default: None.
world_size (int, optional): total number of processes or devices.
If you launch your training with paddle.distributed.run or
paddle.distributed.launch module, None can be given. Default: None.
backend (str, optional): the name of the backend used to initialize
the distributed environment. The value can be one of 'nccl' for
GPU, 'gloo' for CPU or 'hccl' for NPU. Default: 'nccl'.
timeout (datetime.timedelta, optional): timeout used for operations of
the group. Default: datetime.timedelta(0) which means no timeout.
pg_options (dict, optional): options for the group. Default: None.
Returns:
Group: a group.
Examples:
.. code-block:: python
# filename: train.py
import paddle
paddle.distributed.init_parallel_env(0, 1)
# how to start
# python paddle.distributed.run --gpus="0,1" train.py
"""
global _group_map_by_name
global _default_group_name
assert _default_group_name not in _group_map_by_name, (
"The default distributed environment has been initialized.")
assert backend in _valid_backend_list, (
"Backend must be one of {}, but the given one is: {}".format(
_valid_backend_list, backend))
_default_backend = backend
assert isinstance(timeout, timedelta), (
"timeout must be of the type datetime.timedelta.")
if rank is None or world_size is None:
        assert rank is None and world_size is None, (
            "rank and world_size must be both unset or both set.")
trainer_id = os.getenv("PADDLE_TRAINER_ID", None)
trainer_num = os.getenv("PADDLE_TRAINERS_NUM", None)
if trainer_id is None or trainer_num is None:
warnings.warn("If rank and world_size are both None, please start "
"your training with paddle.distributed.run or "
"paddle.distributed.launch module. Otherwise, "
"init_parallel_env will do nothing.")
return None
rank = int(trainer_id)
world_size = int(trainer_num)
assert rank >= 0 and world_size > rank and world_size > 1, (
"rank must be non-negative and world_size must be the "
"maximum rank plus one. Moreover, at least two processes are "
"required to create a process group.")
master_addr = os.getenv("MASTER_ADDR", None)
master_port = os.getenv("MASTER_PORT", None)
if not master_addr or not master_port:
endpoints = os.getenv("PADDLE_MASTER", None)
if endpoints is None:
endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None)
if not endpoints:
raise ValueError(
"The environment variable 'MASTER_ADDR' and 'MASTER_PORT' "
"must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
"and 'export MASTER_ADDR=54612'. Or you can start your training"
"with paddle.distributed.run or "
"paddle.distributed.luanch module.")
if ',' in endpoints:
endpoints = endpoints.split(',')[0]
master_addr, master_port = endpoints.split(":")
master_port = int(master_port)
is_master = rank == 0
global _default_store
_default_store = core.TCPStore(master_addr, master_port, is_master,
world_size, timeout)
pg = _new_process_group_impl(backend, _default_store, rank, world_size,
_default_group_name, pg_options)
ranks = list(range(world_size))
group = Group(
rank, world_size, id=0, ranks=ranks, pg=pg, name=_default_group_name)
paddle.fluid.dygraph.parallel_helper._set_parallel_ctx(True)
_group_map_by_name[_default_group_name] = group
return group
def _new_group(ranks=None,
backend=None,
group_name=None,
timeout=timedelta(0),
pg_options=None):
"""
Create a new process group.
Args:
ranks (list, optional): list of ranks for the new group. If None is given,
            all processes are used. Default: None.
backend (str, optional): the name of the backend used to initialize
the distributed environment. Default: the one for init_parallel_env.
timeout (datetime.timedelta, optional): timeout used for operations of
the group. Default: datetime.timedelta(0).
pg_options (dict, optional): options for the group. Default: None.
Examples:
.. code-block:: python
import paddle
paddle.distributed.init_parallel_env(0, 1)
paddle.distributed.new_group([0, 1])
# how to start
# python paddle.distributed.run --gpus="0,1" train.py
"""
global _default_group_name
if group_name is None:
group_name = _default_group_name + str(_new_group_name_id())
if group_name == _default_group_name:
raise ValueError("group_name must be specified and it cannot be '{}' "
"which is used for the default process group created "
"by init_parallel_env.".format(_default_group_name))
global_group = _get_default_group()
global_rank = global_group.rank
global_ranks = global_group.ranks
if ranks is None:
ranks = global_ranks
assert len(ranks) <= len(global_ranks), (
"Size of new group must be less than or "
"equal to that of the default global group.")
size = len(ranks)
    assert size > 1, "A group must have at least two members."
ranks = sorted(ranks)
if global_rank in ranks:
rank = ranks.index(global_rank)
pg = _new_process_group_impl(backend, _default_store, rank, size,
group_name, pg_options)
else:
rank = -1
pg = None
group = Group(
rank,
size,
id=_new_group_name_id(),
ranks=ranks,
pg=pg,
name=group_name)
_group_map_by_name[group_name] = group
return group
def barrier(group=None):
    """
    ...
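Putting the helpers above together, a typical launch follows the docstring examples: start the script with the distributed launcher, initialize the default environment, and only then create extra groups. The sketch below is hypothetical usage of the public `paddle.distributed` API (not part of this diff):

# Hypothetical launch sketch. Save as train.py and start it with, e.g.:
#   python -m paddle.distributed.launch --gpus="0,1" train.py
import paddle
import paddle.distributed as dist

dist.init_parallel_env()          # initialize the default distributed environment
group = dist.new_group([0, 1])    # create an additional communication group

x = paddle.ones([2, 2]) * dist.get_rank()
dist.all_reduce(x)                # ReduceOp.SUM by default
print(dist.get_rank(), x.numpy())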
...@@ -542,7 +542,7 @@ class IpuStrategy(object): ...@@ -542,7 +542,7 @@ class IpuStrategy(object):
def set_graph_config(self, def set_graph_config(self,
num_ipus=1, num_ipus=1,
is_training=True, is_training=True,
batch_size=1, micro_batch_size=1,
enable_manual_shard=False): enable_manual_shard=False):
""" """
Set graph configuration to the IpuStrategy instance. Set graph configuration to the IpuStrategy instance.
...@@ -571,7 +571,7 @@ class IpuStrategy(object): ...@@ -571,7 +571,7 @@ class IpuStrategy(object):
ipu_strategy = static.IpuStrategy() ipu_strategy = static.IpuStrategy()
ipu_strategy.set_graph_config(num_ipus=1, ipu_strategy.set_graph_config(num_ipus=1,
is_training=True, is_training=True,
batch_size=1, micro_batch_size=1,
enable_manual_shard=False) enable_manual_shard=False)
""" """
if num_ipus == 1 and enable_manual_shard: if num_ipus == 1 and enable_manual_shard:
...@@ -581,7 +581,7 @@ class IpuStrategy(object): ...@@ -581,7 +581,7 @@ class IpuStrategy(object):
options = { options = {
'num_ipus': num_ipus, 'num_ipus': num_ipus,
'is_training': is_training, 'is_training': is_training,
'micro_batch_size': batch_size, 'micro_batch_size': micro_batch_size,
'enable_manual_shard': enable_manual_shard, 'enable_manual_shard': enable_manual_shard,
} }
self.set_options(options) self.set_options(options)
...@@ -589,6 +589,7 @@ class IpuStrategy(object): ...@@ -589,6 +589,7 @@ class IpuStrategy(object):
def set_pipelining_config(self, def set_pipelining_config(self,
enable_pipelining=False, enable_pipelining=False,
batches_per_step=1, batches_per_step=1,
enable_gradient_accumulation=False,
accumulation_factor=1): accumulation_factor=1):
""" """
Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance. Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance.
...@@ -598,6 +599,8 @@ class IpuStrategy(object): ...@@ -598,6 +599,8 @@ class IpuStrategy(object):
Default False, which means disabled. Default False, which means disabled.
batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True, batches_per_step is able to be set > 1. batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True, batches_per_step is able to be set > 1.
Default 1, which means no data pipelining. Default 1, which means no data pipelining.
enable_gradient_accumulation (bool, optional): Accumulate gradients before updating the weights in training mode. It can be set to True only if enable_pipelining=True.
Default False, which means no gradient accumulation.
accumulation_factor (int, optional): Specify the number of micro-batches to accumulate accumulation_factor (int, optional): Specify the number of micro-batches to accumulate
before applying the varUpdate. Default 1, which means disable the accumulation. before applying the varUpdate. Default 1, which means disable the accumulation.
...@@ -617,6 +620,7 @@ class IpuStrategy(object): ...@@ -617,6 +620,7 @@ class IpuStrategy(object):
ipu_strategy = static.IpuStrategy() ipu_strategy = static.IpuStrategy()
ipu_strategy.set_pipelining_config(enable_pipelining=False, ipu_strategy.set_pipelining_config(enable_pipelining=False,
batches_per_step=1, batches_per_step=1,
enable_gradient_accumulation=False,
accumulation_factor=1) accumulation_factor=1)
""" """
enable_manual_shard = self.get_option('enable_manual_shard') enable_manual_shard = self.get_option('enable_manual_shard')
...@@ -627,6 +631,7 @@ class IpuStrategy(object): ...@@ -627,6 +631,7 @@ class IpuStrategy(object):
options = { options = {
'enable_pipelining': enable_pipelining, 'enable_pipelining': enable_pipelining,
'batches_per_step': batches_per_step, 'batches_per_step': batches_per_step,
'enable_gradient_accumulation': enable_gradient_accumulation,
'accumulation_factor': accumulation_factor, 'accumulation_factor': accumulation_factor,
} }
self.set_options(options) self.set_options(options)
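As a rough illustration of how these pipelining knobs compose (this assumes the usual PopART semantics and is not taken from the Paddle documentation): each weight update consumes accumulation_factor micro-batches, and each session run performs batches_per_step such updates.
# Back-of-the-envelope sketch; variable names mirror the arguments above.
micro_batch_size = 2
batches_per_step = 4
accumulation_factor = 8
samples_per_weight_update = micro_batch_size * accumulation_factor   # 16
samples_per_run = samples_per_weight_update * batches_per_step       # 64
assert (samples_per_weight_update, samples_per_run) == (16, 64)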
...@@ -754,6 +759,56 @@ class IpuStrategy(object): ...@@ -754,6 +759,56 @@ class IpuStrategy(object):
""" """
return self._ipu_strategy.get_option(option)['value'] return self._ipu_strategy.get_option(option)['value']
def enable_pattern(self, pattern):
"""
Enable PopART pattern to optimize the graph.
Args:
pattern(string): the name of the pattern.
Returns:
None.
Examples:
.. code-block:: python
# required: ipu
import paddle
import paddle.static as static
paddle.enable_static()
ipu_strategy = static.IpuStrategy()
ipu_strategy.enable_pattern("ViewSimplifyPattern")
"""
self._ipu_strategy.enable_pattern(pattern)
def disable_pattern(self, pattern):
"""
Disable PopART pattern.
Args:
pattern(string): the name of the pattern.
Returns:
None.
Examples:
.. code-block:: python
# required: ipu
import paddle
import paddle.static as static
paddle.enable_static()
ipu_strategy = static.IpuStrategy()
ipu_strategy.disable_pattern("ViewSimplifyPattern")
"""
self._ipu_strategy.disable_pattern(pattern)
@property @property
def num_ipus(self): def num_ipus(self):
""" """
...@@ -817,8 +872,8 @@ class IpuCompiledProgram(object): ...@@ -817,8 +872,8 @@ class IpuCompiledProgram(object):
main_prog = static.default_main_program() main_prog = static.default_main_program()
ipu_strategy = static.IpuStrategy() ipu_strategy = static.IpuStrategy()
ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1) ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1) ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1)
ipu_strategy.set_precision_config(enable_fp16=False) ipu_strategy.set_precision_config(enable_fp16=False)
ipu_compiled_program = static.IpuCompiledProgram( ipu_compiled_program = static.IpuCompiledProgram(
...@@ -891,8 +946,8 @@ class IpuCompiledProgram(object): ...@@ -891,8 +946,8 @@ class IpuCompiledProgram(object):
main_prog = static.default_main_program() main_prog = static.default_main_program()
ipu_strategy = static.IpuStrategy() ipu_strategy = static.IpuStrategy()
ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1) ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1) ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1)
ipu_strategy.set_precision_config(enable_fp16=False) ipu_strategy.set_precision_config(enable_fp16=False)
program = static.IpuCompiledProgram( program = static.IpuCompiledProgram(
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import random
import numpy as np
import os
import shutil
import paddle
from paddle.fluid import core
import datetime
from datetime import timedelta
import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard
from paddle.fluid.dygraph.parallel import ParallelEnv
class TestProcessGroupFp32(unittest.TestCase):
def setUp(self):
self.config()
def config(self):
pass
def test_init_process_group(self):
paddle.distributed.collective._init_parallel_env()
paddle.distributed.collective._new_group()
with self.assertRaises(ValueError):
paddle.distributed.collective._new_group(
backend="gloo", group_name="_default_pg")
print("test ok\n")
if __name__ == "__main__":
unittest.main()
...@@ -98,5 +98,117 @@ class TestBase(IPUOpTest): ...@@ -98,5 +98,117 @@ class TestBase(IPUOpTest):
self.check(output_dict) self.check(output_dict)
class TestAssignFp32Value(TestBase):
def set_data_feed(self):
data = np.random.uniform(size=[2, 3, 1])
self.feed_fp32 = {'in_0': data.astype(np.float32)}
self.feed_fp16 = {'in_0': data.astype(np.float16)}
data = np.random.uniform(size=[2, 3, 1])
self.assign_fp32 = data.astype(np.float32)
def _test_base(self, exec_mode):
scope = paddle.static.Scope()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
main_prog.random_seed = self.SEED
startup_prog.random_seed = self.SEED
with paddle.static.scope_guard(scope):
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(
name=self.feed_list[0],
shape=self.feed_shape[0],
dtype='float32')
assign = paddle.assign(self.assign_fp32)
out = paddle.fluid.layers.elementwise_add(x, assign)
fetch_list = [out.name]
if exec_mode == ExecutionMode.CPU_FP32:
place = paddle.CPUPlace()
else:
place = paddle.IPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
if exec_mode != ExecutionMode.CPU_FP32:
feed_list = self.feed_list
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(is_training=self.is_training)
if exec_mode == ExecutionMode.IPU_POPART_FP16:
ipu_strategy.set_precision_config(enable_fp16=True)
program = paddle.static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
else:
program = main_prog
feed = self.feed_fp32
if exec_mode > ExecutionMode.IPU_FP32:
feed = self.feed_fp16
result = exe.run(program, feed=feed, fetch_list=fetch_list)
return result[0]
class TestAssignBoolValue(TestBase):
def set_data_feed(self):
data = np.random.uniform(size=[2, 3, 1])
self.feed_fp32 = {'in_0': data.astype(np.float32)}
self.feed_fp16 = {'in_0': data.astype(np.float16)}
data = np.random.choice([True, False], size=(2, 3, 1))
self.assign_bool = data.astype(np.bool)
def _test_base(self, exec_mode):
scope = paddle.static.Scope()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
main_prog.random_seed = self.SEED
startup_prog.random_seed = self.SEED
with paddle.static.scope_guard(scope):
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(
name=self.feed_list[0],
shape=self.feed_shape[0],
dtype='float32')
x = paddle.less_than(x, x)
assign = paddle.assign(self.assign_bool)
out = paddle.logical_and(x, assign)
out = paddle.cast(out, 'float32')
fetch_list = [out.name]
if exec_mode == ExecutionMode.CPU_FP32:
place = paddle.CPUPlace()
else:
place = paddle.IPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
if exec_mode != ExecutionMode.CPU_FP32:
feed_list = self.feed_list
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(is_training=self.is_training)
if exec_mode == ExecutionMode.IPU_POPART_FP16:
ipu_strategy.set_precision_config(enable_fp16=True)
program = paddle.static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
else:
program = main_prog
feed = self.feed_fp32
if exec_mode > ExecutionMode.IPU_FP32:
feed = self.feed_fp16
result = exe.run(program, feed=feed, fetch_list=fetch_list)
return result[0]
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -22,33 +22,18 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMod ...@@ -22,33 +22,18 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMod
@unittest.skipIf(not paddle.is_compiled_with_ipu(), @unittest.skipIf(not paddle.is_compiled_with_ipu(),
"core is not compiled with IPU") "core is not compiled with IPU")
class TestBase(IPUOpTest): class TestGreaterThan(IPUOpTest):
def setUp(self): def setUp(self):
self.set_atol() self.set_atol()
self.set_training() self.set_training()
self.set_data_feed() self.set_test_op()
self.set_feed_attr()
self.set_op_attrs()
@property @property
def fp16_enabled(self): def fp16_enabled(self):
return True return True
def set_data_feed(self): def set_test_op(self):
x = np.random.randn(3, 4, 5) self.op = paddle.fluid.layers.greater_than
y = np.random.randn(3, 4, 5)
self.feed_fp32 = {
"x": x.astype(np.float32),
"y": y.astype(np.float32),
}
self.feed_fp16 = {
"x": x.astype(np.float16),
"y": y.astype(np.float16),
}
def set_feed_attr(self):
self.feed_shape = [x.shape for x in self.feed_fp32.values()]
self.feed_list = list(self.feed_fp32.keys())
def set_op_attrs(self): def set_op_attrs(self):
self.attrs = {} self.attrs = {}
...@@ -71,7 +56,7 @@ class TestBase(IPUOpTest): ...@@ -71,7 +56,7 @@ class TestBase(IPUOpTest):
shape=self.feed_shape[1], shape=self.feed_shape[1],
dtype='float32') dtype='float32')
out = paddle.fluid.layers.greater_than(x, y, **self.attrs) out = self.op(x, y, **self.attrs)
fetch_list = [out.name] fetch_list = [out.name]
...@@ -102,7 +87,7 @@ class TestBase(IPUOpTest): ...@@ -102,7 +87,7 @@ class TestBase(IPUOpTest):
result = exe.run(program, feed=feed, fetch_list=fetch_list) result = exe.run(program, feed=feed, fetch_list=fetch_list)
return result[0] return result[0]
def test(self): def run_test_base(self):
output_dict = {} output_dict = {}
for mode in ExecutionMode: for mode in ExecutionMode:
if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
...@@ -111,29 +96,73 @@ class TestBase(IPUOpTest): ...@@ -111,29 +96,73 @@ class TestBase(IPUOpTest):
self.check(output_dict) self.check(output_dict)
def set_feed_attr(self):
self.feed_shape = [x.shape for x in self.feed_fp32.values()]
self.feed_list = list(self.feed_fp32.keys())
def set_data_feed0(self):
x = np.random.randn(3, 4, 5)
y = np.random.randn(3, 4, 5)
self.feed_fp32 = {
"x": x.astype(np.float32),
"y": y.astype(np.float32),
}
self.feed_fp16 = {
"x": x.astype(np.float16),
"y": y.astype(np.float16),
}
self.set_feed_attr()
class TestCase1(TestBase): def set_data_feed1(self):
def set_data_feed(self):
x = np.ones([1, 10]) x = np.ones([1, 10])
y = np.ones([10]) y = np.ones([10])
self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
self.set_feed_attr()
def set_data_feed2(self):
class TestCase2(TestBase):
def set_data_feed(self):
x = np.ones([1, 10]) x = np.ones([1, 10])
y = np.zeros([1, 10]) y = np.zeros([1, 10])
self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
self.set_feed_attr()
def set_data_feed3(self):
class TestCase3(TestBase):
def set_data_feed(self):
x = np.zeros([1, 10]) x = np.zeros([1, 10])
y = np.ones([1, 10]) y = np.ones([1, 10])
self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
self.set_feed_attr()
def test_case0(self):
self.set_data_feed0()
self.set_op_attrs()
self.run_test_base()
def test_case1(self):
self.set_data_feed1()
self.set_op_attrs()
self.run_test_base()
def test_case2(self):
self.set_data_feed2()
self.set_op_attrs()
self.run_test_base()
def test_case3(self):
self.set_data_feed3()
self.set_op_attrs()
self.run_test_base()
class TestLessThan(TestGreaterThan):
def set_test_op(self):
self.op = paddle.fluid.layers.less_than
class TestEqual(TestGreaterThan):
def set_test_op(self):
self.op = paddle.fluid.layers.equal
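The three comparison ops covered by these tests have straightforward NumPy counterparts; a reference sketch (illustration only, not Paddle code), including the broadcast case from set_data_feed1:
import numpy as np
x = np.ones([1, 10], dtype=np.float32)
y = np.zeros([1, 10], dtype=np.float32)
assert (x > y).all()        # greater_than
assert not (x < y).any()    # less_than
assert not (x == y).any()   # equal
# broadcasting [1, 10] against [10], as in set_data_feed1
xb = np.ones([1, 10], dtype=np.float32)
yb = np.ones([10], dtype=np.float32)
assert not (xb > yb).any()  # all elements equal, so greater_than is all False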
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -12,11 +12,11 @@ ...@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function import unittest
import numpy as np import numpy as np
import unittest
import paddle import paddle
import paddle.static
paddle.enable_static() paddle.enable_static()
...@@ -26,30 +26,31 @@ paddle.enable_static() ...@@ -26,30 +26,31 @@ paddle.enable_static()
class TestIpuShard(unittest.TestCase): class TestIpuShard(unittest.TestCase):
def _test(self): def _test(self):
# build graph # build graph
a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') main_prog = paddle.static.Program()
b = a + 2 # scale : scale * x + bias, ipu_index : no with paddle.static.program_guard(main_prog):
a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
b = a + 2 # scale : scale * x + bias, ipu_index : no
with paddle.static.ipu_shard_guard(index=1):
c = b + 1 # scale, ipu_index : 1
with paddle.static.ipu_shard_guard(index=2):
d = c * 2 # scale, ipu_index : 2
with paddle.static.ipu_shard_guard(index=3):
e = d + 3 # scale, ipu_index : 3
with paddle.static.ipu_shard_guard(index=1):
e = e + 3 # scale, ipu_index : 1
with paddle.static.ipu_shard_guard(index=2):
e = e + 3 # scale, ipu_index : 2
with paddle.static.ipu_shard_guard(index=1):
f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1
with paddle.static.ipu_shard_guard(index=1):
c = b + 1 # scale, ipu_index : 1
with paddle.static.ipu_shard_guard(index=2): with paddle.static.ipu_shard_guard(index=2):
d = c * 2 # scale, ipu_index : 2 g = f - 1 # scale, ipu_index : 2
with paddle.static.ipu_shard_guard(index=3):
e = d + 3 # scale, ipu_index : 3
with paddle.static.ipu_shard_guard(index=1):
e = e + 3 # scale, ipu_index : 1
with paddle.static.ipu_shard_guard(index=2):
e = e + 3 # scale, ipu_index : 2
with paddle.static.ipu_shard_guard(index=1):
f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1
with paddle.static.ipu_shard_guard(index=2): h = g + 1 # scale, ipu_index : no
g = f - 1 # scale, ipu_index : 2
h = g + 1 # scale, ipu_index : no
ipu_index_list = [] ipu_index_list = []
main_prog = paddle.static.default_main_program()
for op in main_prog.global_block().ops: for op in main_prog.global_block().ops:
if op.desc.has_attr("ipu_index"): if op.desc.has_attr("ipu_index"):
ipu_index_list.append(op.desc.attr("ipu_index")) ipu_index_list.append(op.desc.attr("ipu_index"))
...@@ -69,30 +70,31 @@ class TestIpuShard(unittest.TestCase): ...@@ -69,30 +70,31 @@ class TestIpuShard(unittest.TestCase):
class TestIpuPipeline(unittest.TestCase): class TestIpuPipeline(unittest.TestCase):
def _test(self): def _test(self):
# build graph # build graph
a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') main_prog = paddle.static.Program()
b = a + 2 # scale : scale * x + bias, ipu_stage : no with paddle.static.program_guard(main_prog):
a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
b = a + 2 # scale : scale * x + bias, ipu_stage : no
with paddle.static.ipu_shard_guard(stage=1):
c = b + 1 # scale, ipu_stage : 1
with paddle.static.ipu_shard_guard(stage=2):
d = c * 2 # scale, ipu_stage : 2
with paddle.static.ipu_shard_guard(stage=3):
e = d + 3 # scale, ipu_stage : 3
with paddle.static.ipu_shard_guard(stage=1):
e = e + 3 # scale, ipu_stage : 1
with paddle.static.ipu_shard_guard(stage=2):
e = e + 3 # scale, ipu_stage : 2
with paddle.static.ipu_shard_guard(stage=1):
f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1
with paddle.static.ipu_shard_guard(stage=1):
c = b + 1 # scale, ipu_stage : 1
with paddle.static.ipu_shard_guard(stage=2): with paddle.static.ipu_shard_guard(stage=2):
d = c * 2 # scale, ipu_stage : 2 g = f - 1 # scale, ipu_stage : 2
with paddle.static.ipu_shard_guard(stage=3):
e = d + 3 # scale, ipu_stage : 3
with paddle.static.ipu_shard_guard(stage=1):
e = e + 3 # scale, ipu_stage : 1
with paddle.static.ipu_shard_guard(stage=2):
e = e + 3 # scale, ipu_stage : 2
with paddle.static.ipu_shard_guard(stage=1):
f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1
with paddle.static.ipu_shard_guard(stage=2):
g = f - 1 # scale, ipu_stage : 2
h = g + 1 # scale, ipu_stage : no h = g + 1 # scale, ipu_stage : no
ipu_index_list = [] ipu_index_list = []
main_prog = paddle.static.default_main_program()
for op in main_prog.global_block().ops: for op in main_prog.global_block().ops:
if op.desc.has_attr("ipu_stage"): if op.desc.has_attr("ipu_stage"):
ipu_index_list.append(op.desc.attr("ipu_stage")) ipu_index_list.append(op.desc.attr("ipu_stage"))
......
...@@ -26,7 +26,13 @@ class TestIpuStrategy(unittest.TestCase): ...@@ -26,7 +26,13 @@ class TestIpuStrategy(unittest.TestCase):
def test_set_options(self): def test_set_options(self):
ipu_strategy = paddle.static.IpuStrategy() ipu_strategy = paddle.static.IpuStrategy()
all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() all_option_names = ipu_strategy._ipu_strategy.get_all_option_names()
skip_options = []
skip_options.append('random_seed')
for option_name in all_option_names: for option_name in all_option_names:
if option_name in skip_options:
continue
option = ipu_strategy._ipu_strategy.get_option(option_name) option = ipu_strategy._ipu_strategy.get_option(option_name)
option_type = option['type'] option_type = option['type']
option_value = option['value'] option_value = option['value']
...@@ -38,9 +44,13 @@ class TestIpuStrategy(unittest.TestCase): ...@@ -38,9 +44,13 @@ class TestIpuStrategy(unittest.TestCase):
set_value = not option_value set_value = not option_value
else: else:
continue continue
ipu_strategy.set_options({option_name: set_value})
new_value = ipu_strategy.get_option(option_name) try:
assert new_value == set_value, f"set {option_name} to {set_value} failed" ipu_strategy.set_options({option_name: set_value})
new_value = ipu_strategy.get_option(option_name)
assert new_value == set_value, f"set {option_name} to {set_value} failed"
except:
raise Exception(f"set {option_name} to {set_value} failed")
def test_set_string_options(self): def test_set_string_options(self):
ipu_strategy = paddle.static.IpuStrategy() ipu_strategy = paddle.static.IpuStrategy()
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.static
from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
@unittest.skipIf(not paddle.is_compiled_with_ipu(),
"core is not compiled with IPU")
class TestLogicalAnd(IPUOpTest):
def setUp(self):
self.set_atol()
self.set_training()
self.set_test_op()
@property
def fp16_enabled(self):
return False
def set_test_op(self):
self.op = paddle.fluid.layers.logical_and
def set_op_attrs(self):
self.attrs = {}
def _test_base(self, exec_mode):
scope = paddle.static.Scope()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
main_prog.random_seed = self.SEED
startup_prog.random_seed = self.SEED
with paddle.static.scope_guard(scope):
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(
name=self.feed_list[0],
shape=self.feed_shape[0],
dtype=self.feed_dtype[0])
y = paddle.static.data(
name=self.feed_list[1],
shape=self.feed_shape[1],
dtype=self.feed_dtype[1])
out = self.op(x, y, **self.attrs)
fetch_list = [out.name]
if exec_mode == ExecutionMode.CPU_FP32:
place = paddle.CPUPlace()
else:
place = paddle.IPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
if exec_mode != ExecutionMode.CPU_FP32:
feed_list = self.feed_list
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(is_training=self.is_training)
if exec_mode == ExecutionMode.IPU_POPART_FP16:
ipu_strategy.set_precision_config(enable_fp16=True)
program = paddle.static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
else:
program = main_prog
result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
return result[0]
def run_test_base(self):
output_dict = {}
for mode in ExecutionMode:
if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
break
output_dict[mode] = self._test_base(mode).astype(np.int32)
self.check(output_dict, check_shape=True)
def set_feed_attr(self):
self.feed_shape = [x.shape for x in self.feed.values()]
self.feed_list = list(self.feed.keys())
self.feed_dtype = ['bool', 'bool']
def set_data_feed0(self):
x = np.random.choice([True, False], size=(1, 3, 5, 5))
y = np.random.choice([True, False], size=(1, 3, 5, 5))
self.feed = {
"x": x.astype('bool'),
"y": y.astype('bool'),
}
self.set_feed_attr()
def test_case0(self):
self.set_data_feed0()
self.set_op_attrs()
self.run_test_base()
class TestLogicalOr(TestLogicalAnd):
def set_test_op(self):
self.op = paddle.fluid.layers.logical_or
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.static
from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
@unittest.skipIf(not paddle.is_compiled_with_ipu(),
"core is not compiled with IPU")
class TestBase(IPUOpTest):
def setUp(self):
self.set_atol()
self.set_training()
self.set_data_feed()
self.set_feed_attr()
self.set_op_attrs()
@property
def fp16_enabled(self):
return True
def set_data_feed(self):
data1 = np.array([[1], [1], [3], [0]])
self.feed = {'x': data1.astype(np.int32)}
def set_feed_attr(self):
self.feed_shape = [x.shape for x in self.feed.values()]
self.feed_list = list(self.feed.keys())
def set_op_attrs(self):
self.attrs = {"depth": 4, "allow_out_of_range": False}
def _test_base(self, exec_mode):
scope = paddle.static.Scope()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
main_prog.random_seed = self.SEED
startup_prog.random_seed = self.SEED
with paddle.static.scope_guard(scope):
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(
name=self.feed_list[0],
shape=self.feed_shape[0],
dtype='int32')
out = paddle.fluid.layers.one_hot(x, **self.attrs)
fetch_list = [out.name]
if exec_mode == ExecutionMode.CPU_FP32:
place = paddle.CPUPlace()
else:
place = paddle.IPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
if exec_mode != ExecutionMode.CPU_FP32:
feed_list = self.feed_list
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(is_training=self.is_training)
if exec_mode == ExecutionMode.IPU_POPART_FP16:
ipu_strategy.set_precision_config(enable_fp16=True)
program = paddle.static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
else:
program = main_prog
feed = self.feed
result = exe.run(program, feed=feed, fetch_list=fetch_list)
return result[0]
def test_base(self):
output_dict = {}
for mode in ExecutionMode:
if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled):
break
output_dict[mode] = self._test_base(mode).flatten()
self.check(output_dict)
@unittest.skip('does not support allow_out_of_range=True')
class TestCase1(TestBase):
def set_op_attrs(self):
self.attrs = {"depth": 4, "allow_out_of_range": True}
if __name__ == "__main__":
unittest.main()
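A NumPy reference (illustration only) for the one_hot case exercised above: depth=4 with indices [[1], [1], [3], [0]] yields one row of length 4 per index, with a single 1.0 at the index position.
import numpy as np
indices = np.array([[1], [1], [3], [0]], dtype=np.int32)
depth = 4
reference = np.eye(depth, dtype=np.float32)[indices.flatten()]
assert reference.shape == (4, 4)
assert (reference.argmax(axis=1) == indices.flatten()).all()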
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.static
from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
@unittest.skipIf(not paddle.is_compiled_with_ipu(),
"core is not compiled with IPU")
class TestBase(IPUOpTest):
def setUp(self):
self.set_atol()
self.set_training()
self.set_data_feed()
self.set_feed_attr()
self.set_op_attrs()
@property
def fp16_enabled(self):
return True
def set_data_feed(self):
data1 = np.array([[1], [1], [3], [0]])
self.feed = {'x': data1.astype(np.int32)}
def set_feed_attr(self):
self.feed_shape = [x.shape for x in self.feed.values()]
self.feed_list = list(self.feed.keys())
def set_op_attrs(self):
self.attrs = {"depth": 4, "allow_out_of_range": False}
def _test_base(self, exec_mode):
scope = paddle.static.Scope()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
main_prog.random_seed = self.SEED
startup_prog.random_seed = self.SEED
with paddle.static.scope_guard(scope):
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(
name=self.feed_list[0],
shape=self.feed_shape[0],
dtype='int32')
out = paddle.fluid.input.one_hot(x, **self.attrs)
fetch_list = [out.name]
if exec_mode == ExecutionMode.CPU_FP32:
place = paddle.CPUPlace()
else:
place = paddle.IPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
if exec_mode != ExecutionMode.CPU_FP32:
feed_list = self.feed_list
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(is_training=self.is_training)
if exec_mode == ExecutionMode.IPU_POPART_FP16:
ipu_strategy.set_precision_config(enable_fp16=True)
program = paddle.static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
else:
program = main_prog
feed = self.feed
result = exe.run(program, feed=feed, fetch_list=fetch_list)
return result[0]
def test_base(self):
output_dict = {}
for mode in ExecutionMode:
if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled):
break
output_dict[mode] = self._test_base(mode).flatten()
self.check(output_dict)
@unittest.skip('does not support allow_out_of_range=True')
class TestCase1(TestBase):
def set_op_attrs(self):
self.attrs = {"depth": 4, "allow_out_of_range": True}
if __name__ == "__main__":
unittest.main()
...@@ -91,6 +91,15 @@ class TestBase(IPUOpTest): ...@@ -91,6 +91,15 @@ class TestBase(IPUOpTest):
ipu_strategy = paddle.static.IpuStrategy() ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(is_training=True) ipu_strategy.set_graph_config(is_training=True)
ipu_strategy.loss_scaling = self.attrs["loss_scaling"] ipu_strategy.loss_scaling = self.attrs["loss_scaling"]
if "use_no_bias_optimizer" in self.attrs.keys():
ipu_strategy.set_options({
"use_no_bias_optimizer":
self.attrs["use_no_bias_optimizer"]
})
if "accl1_type" in self.attrs.keys():
ipu_strategy.set_options({
"accl1_type": self.attrs["accl1_type"]
})
program = paddle.static.IpuCompiledProgram( program = paddle.static.IpuCompiledProgram(
main_prog, ipu_strategy=ipu_strategy).compile(feed_list, main_prog, ipu_strategy=ipu_strategy).compile(feed_list,
fetch_list) fetch_list)
...@@ -141,6 +150,28 @@ class TestAdamCase2(TestBase): ...@@ -141,6 +150,28 @@ class TestAdamCase2(TestBase):
} }
@unittest.skip('CPU does not support AdamNoBias')
class TestAdamNoBias(TestBase):
def set_attrs(self):
self.attrs = {
"optimizer": 'adam',
"weight_decay": 0.0,
"loss_scaling": 4.0,
"use_no_bias_optimizer": True,
}
@unittest.skip('CPU does not support FLOAT16')
class TestAdamCase3(TestBase):
def set_attrs(self):
self.attrs = {
"optimizer": 'adam',
"weight_decay": 0.0,
"loss_scaling": 4.0,
"accl1_type": "FLOAT16",
}
@unittest.skip('seems cpu output wrong') @unittest.skip('seems cpu output wrong')
class TestLambCase1(TestBase): class TestLambCase1(TestBase):
def set_attrs(self): def set_attrs(self):
...@@ -161,5 +192,27 @@ class TestLamb(TestBase): ...@@ -161,5 +192,27 @@ class TestLamb(TestBase):
} }
@unittest.skip('CPU does not support LambNoBias')
class TestLambNoBias(TestBase):
def set_attrs(self):
self.attrs = {
"optimizer": 'lamb',
"weight_decay": 0.1,
"loss_scaling": 6.0,
"use_no_bias_optimizer": True
}
@unittest.skip('CPU does not support FLOAT16')
class TestLambCase2(TestBase):
def set_attrs(self):
self.attrs = {
"optimizer": 'lamb',
"weight_decay": 0.1,
"loss_scaling": 6.0,
"accl1_type": "FLOAT16"
}
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -95,12 +95,9 @@ class TestBase(IPUOpTest): ...@@ -95,12 +95,9 @@ class TestBase(IPUOpTest):
is_training=self.attrs['is_training']) is_training=self.attrs['is_training'])
ipu_strategy.set_precision_config( ipu_strategy.set_precision_config(
enable_fp16=self.attrs['enable_fp16']) enable_fp16=self.attrs['enable_fp16'])
ipu_strategy.set_options({ ipu_program = paddle.static.IpuCompiledProgram(
'save_per_n_step': self.attrs['save_at_step'] main_prog, ipu_strategy=ipu_strategy)
}) program = ipu_program.compile(self.feed_list, fetch_list)
program = paddle.static.IpuCompiledProgram(
main_prog, ipu_strategy=ipu_strategy).compile(
self.feed_list, fetch_list)
result = [] result = []
run_steps = self.attrs['steps'] if save_otherwise_load \ run_steps = self.attrs['steps'] if save_otherwise_load \
...@@ -111,10 +108,9 @@ class TestBase(IPUOpTest): ...@@ -111,10 +108,9 @@ class TestBase(IPUOpTest):
for i in range(run_steps): for i in range(run_steps):
tmp = exe.run(program, feed=feed, fetch_list=fetch_list) tmp = exe.run(program, feed=feed, fetch_list=fetch_list)
# currently, we update opt state every sess.run,
# will optimize
if save_otherwise_load and \ if save_otherwise_load and \
i == self.attrs['save_at_step'] - 1: i == self.attrs['save_at_step'] - 1:
ipu_program._backend.weights_to_host()
paddle.static.save(main_prog, paddle.static.save(main_prog,
self.attrs['model_path'].name) self.attrs['model_path'].name)
......
...@@ -88,11 +88,10 @@ class TestBase(IPUOpTest): ...@@ -88,11 +88,10 @@ class TestBase(IPUOpTest):
if exec_mode != ExecutionMode.CPU_FP32: if exec_mode != ExecutionMode.CPU_FP32:
feed_list = self.feed_list feed_list = self.feed_list
ipu_strategy = paddle.static.IpuStrategy() ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(is_training=self.is_training) ipu_strategy.set_graph_config(
is_training=self.is_training, micro_batch_size=2)
if exec_mode == ExecutionMode.IPU_POPART_FP16: if exec_mode == ExecutionMode.IPU_POPART_FP16:
ipu_strategy.set_precision_config(enable_fp16=True) ipu_strategy.set_precision_config(enable_fp16=True)
# set batch size
ipu_strategy.micro_batch_size = 2
program = paddle.static.IpuCompiledProgram( program = paddle.static.IpuCompiledProgram(
main_prog, main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
......
...@@ -25,6 +25,9 @@ class TestProcessGroup(TestMultipleGpus): ...@@ -25,6 +25,9 @@ class TestProcessGroup(TestMultipleGpus):
def test_process_group_gloo(self): def test_process_group_gloo(self):
self.run_mnist_2gpu('process_group_gloo.py') self.run_mnist_2gpu('process_group_gloo.py')
def test_init_process_group(self):
self.run_mnist_2gpu('init_process_group.py')
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -116,6 +116,54 @@ class TestEagerGrad(TestCase): ...@@ -116,6 +116,54 @@ class TestEagerGrad(TestCase):
self.func_simple_example_eager_grad_not_allow_unused() self.func_simple_example_eager_grad_not_allow_unused()
self.func_simple_example_eager_grad_not_allow_unused() self.func_simple_example_eager_grad_not_allow_unused()
def func_simple_example_eager_grad_duplicate_input(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
np_z = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
out_z = paddle.nn.functional.sigmoid(z)
out = paddle.matmul(x, y)
try:
# duplicate inputs will raise a RuntimeError
dx = fluid.dygraph.grad(out, [x, x])
except RuntimeError as e:
error_msg = cpt.get_exception_message(e)
assert error_msg.find("duplicate") > 0
def test_simple_example_eager_grad_duplicate_input(self):
with _test_eager_guard():
self.func_simple_example_eager_grad_duplicate_input()
self.func_simple_example_eager_grad_duplicate_input()
def func_simple_example_eager_grad_duplicate_output(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
np_z = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
out_z = paddle.nn.functional.sigmoid(z)
out = paddle.matmul(x, y)
try:
# duplicate outputs will raise a RuntimeError
dx = fluid.dygraph.grad([out, out], [x])
except RuntimeError as e:
error_msg = cpt.get_exception_message(e)
assert error_msg.find("duplicate") > 0
def test_simple_example_eager_grad_duplicate_output(self):
with _test_eager_guard():
self.func_simple_example_eager_grad_duplicate_output()
self.func_simple_example_eager_grad_duplicate_output()
class TestDygraphDoubleGrad(TestCase): class TestDygraphDoubleGrad(TestCase):
def setUp(self): def setUp(self):
......
...@@ -23,6 +23,7 @@ import paddle.fluid as fluid ...@@ -23,6 +23,7 @@ import paddle.fluid as fluid
from op_test_xpu import XPUOpTest from op_test_xpu import XPUOpTest
import paddle import paddle
from paddle.fluid import Program, program_guard from paddle.fluid import Program, program_guard
from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
def conv2d_forward_naive(input, def conv2d_forward_naive(input,
...@@ -159,320 +160,334 @@ def create_test_padding_VALID_class(parent): ...@@ -159,320 +160,334 @@ def create_test_padding_VALID_class(parent):
globals()[cls_name] = TestPaddingVALIDCase globals()[cls_name] = TestPaddingVALIDCase
class TestConv2DOp(XPUOpTest): class XPUTestConv2DOp(XPUOpTestWrapper):
def setUp(self): def __init__(self):
self.op_type = "conv2d" self.op_name = 'conv2d'
self.use_cudnn = False self.use_dynamic_create_class = False
self.exhaustive_search = False
self.use_cuda = False class TestConv2DOp(XPUOpTest):
self.use_mkldnn = False def setUp(self):
self.fuse_relu_before_depthwise_conv = False self.dtype = self.in_type
self.data_format = "AnyLayout" self.place = paddle.XPUPlace(0)
self.dtype = np.float32 self.op_type = "conv2d"
self.init_kernel_type() self.use_cudnn = False
self.init_group() self.exhaustive_search = False
self.init_dilation() self.use_cuda = False
self.init_test_case() self.use_mkldnn = False
conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
input = np.random.random(self.input_size).astype(self.dtype)
if not self.has_cuda():
self.fuse_relu_before_depthwise_conv = False self.fuse_relu_before_depthwise_conv = False
if self.fuse_relu_before_depthwise_conv: self.data_format = "AnyLayout"
input = input - 0.5 self.init_kernel_type()
input -= (input < 0) * 0.1 self.init_group()
input += (input >= 0) * 0.1 self.init_dilation()
input2 = np.maximum(input, 0.0) self.init_test_case()
else:
input2 = input conv2d_param = {
filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) 'stride': self.stride,
'pad': self.pad,
output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups, 'dilation': self.dilations
conv2d_param) }
output = output.astype(self.dtype)
np.random.seed(100)
self.inputs = { input = np.random.random(self.input_size).astype(self.dtype)
'Input': XPUOpTest.np_dtype_to_fluid_dtype(input), if not self.has_cuda():
'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter) self.fuse_relu_before_depthwise_conv = False
} if self.fuse_relu_before_depthwise_conv:
self.attrs = { input = input - 0.5
'strides': self.stride, input -= (input < 0) * 0.1
'paddings': self.pad, input += (input >= 0) * 0.1
'groups': self.groups, input2 = np.maximum(input, 0.0)
'dilations': self.dilations, else:
'use_cudnn': self.use_cudnn, input2 = input
'use_mkldnn': self.use_mkldnn, np.random.seed(1)
'data_format': self.data_format, filter = np.random.uniform(-1, 1,
'fuse_relu_before_depthwise_conv': self.filter_size).astype(self.dtype)
self.fuse_relu_before_depthwise_conv,
'exhaustive_search': self.exhaustive_search output, _, _, _, _ = conv2d_forward_naive(input2, filter,
} self.groups, conv2d_param)
self.outputs = {'Output': output} output = output.astype(self.dtype)
def has_cuda(self): self.inputs = {
return core.is_compiled_with_cuda() and (self.use_cudnn or 'Input': XPUOpTest.np_dtype_to_fluid_dtype(input),
self.use_cuda) 'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter)
}
def test_check_output(self): self.attrs = {
if core.is_compiled_with_xpu(): 'strides': self.stride,
paddle.enable_static() 'paddings': self.pad,
place = paddle.XPUPlace(0) 'groups': self.groups,
self.check_output_with_place(place) 'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
def test_check_grad(self): 'use_mkldnn': self.use_mkldnn,
if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and 'data_format': self.data_format,
self.no_need_check_grad == True): 'fuse_relu_before_depthwise_conv':
return self.fuse_relu_before_depthwise_conv,
if core.is_compiled_with_xpu(): 'exhaustive_search': self.exhaustive_search
paddle.enable_static() }
place = paddle.XPUPlace(0) self.outputs = {'Output': output}
self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output')
def has_cuda(self):
def test_check_grad_no_filter(self): return core.is_compiled_with_cuda() and (self.use_cudnn or
if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and self.use_cuda)
self.no_need_check_grad == True):
return def test_check_output(self):
if core.is_compiled_with_xpu(): if core.is_compiled_with_xpu():
paddle.enable_static() paddle.enable_static()
place = paddle.XPUPlace(0) self.check_output_with_place(self.place)
self.check_grad_with_place(
place, ['Input'], 'Output', no_grad_set=set(['Filter'])) def test_check_grad(self):
if (hasattr(self, "no_need_check_grad") and
def test_check_grad_no_input(self): self.no_need_check_grad == True):
if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and return
self.no_need_check_grad == True): if core.is_compiled_with_xpu():
return paddle.enable_static()
if core.is_compiled_with_xpu(): self.check_grad_with_place(self.place, {'Input', 'Filter'},
paddle.enable_static() 'Output')
place = paddle.XPUPlace(0)
self.check_grad_with_place( def test_check_grad_no_filter(self):
place, ['Filter'], 'Output', no_grad_set=set(['Input'])) if (hasattr(self, "no_need_check_grad") and
self.no_need_check_grad == True):
def init_test_case(self): return
self.pad = [0, 0] if core.is_compiled_with_xpu():
self.stride = [1, 1] paddle.enable_static()
self.input_size = [2, 3, 5, 5] # NCHW self.check_grad_with_place(
assert np.mod(self.input_size[1], self.groups) == 0 self.place, ['Input'],
f_c = self.input_size[1] // self.groups 'Output',
self.filter_size = [6, f_c, 3, 3] no_grad_set=set(['Filter']))
def init_test_case_2(self): def test_check_grad_no_input(self):
pass if (hasattr(self, "no_need_check_grad") and
self.no_need_check_grad == True):
def init_dilation(self): return
self.dilations = [1, 1] if core.is_compiled_with_xpu():
paddle.enable_static()
def init_group(self): self.check_grad_with_place(
self.groups = 1 self.place, ['Filter'],
'Output',
def init_kernel_type(self): no_grad_set=set(['Input']))
pass
def init_test_case(self):
self.pad = [0, 0]
class TestWithPad(TestConv2DOp): self.stride = [1, 1]
def init_test_case(self): self.input_size = [2, 3, 5, 5] # NCHW
self.pad = [1, 1] assert np.mod(self.input_size[1], self.groups) == 0
self.stride = [1, 1] f_c = self.input_size[1] // self.groups
self.input_size = [2, 3, 5, 5] # NCHW self.filter_size = [6, f_c, 3, 3]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestWithStride(TestConv2DOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestWith1x1(TestConv2DOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [120, f_c, 1, 1]
def init_group(self):
self.groups = 1
# Please Don't remove the following code.
# Currently, CI use cudnn V5.0 which not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation):
# def init_op_type(self):
# self.op_type = "conv_cudnn"
# ---- test asymmetric padding ---- def init_test_case_2(self):
pass
def init_dilation(self):
self.dilations = [1, 1]
def init_group(self):
self.groups = 1
def init_kernel_type(self):
pass
class TestWithPad(TestConv2DOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestWithStride(TestConv2DOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestWith1x1(TestConv2DOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [120, f_c, 1, 1]
def init_group(self):
self.groups = 1
class TestConv2DOp_v2(XPUOpTest):
def setUp(self): # ---- test asymmetric padding ----
self.op_type = "conv2d" class XPUTestConv2DOp_v2(XPUOpTestWrapper):
self.use_cudnn = False def __init__(self):
self.exhaustive_search = False self.op_name = 'conv2d'
self.use_cuda = False self.use_dynamic_create_class = False
self.use_mkldnn = False
self.fuse_relu_before_depthwise_conv = False class TestConv2DOp_v2(XPUOpTest):
self.dtype = np.float32 def setUp(self):
self.init_kernel_type() self.dtype = self.in_type
self.init_group() self.place = paddle.XPUPlace(0)
self.init_dilation() self.op_type = "conv2d"
self.init_data_format() self.use_cudnn = False
self.init_test_case() self.exhaustive_search = False
self.init_paddings() self.use_cuda = False
self.init_test_case_2() self.use_mkldnn = False
conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
input = np.random.random(self.input_size).astype(self.dtype)
if not self.has_cuda():
self.fuse_relu_before_depthwise_conv = False self.fuse_relu_before_depthwise_conv = False
if self.fuse_relu_before_depthwise_conv: self.init_kernel_type()
input = input - 0.5 self.init_group()
input -= (input < 0) * 0.1 self.init_dilation()
input += (input >= 0) * 0.1 self.init_data_format()
input2 = np.maximum(input, 0.0) self.init_test_case()
else: self.init_paddings()
input2 = input self.init_test_case_2()
filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
output, _, _, _, _ = conv2d_forward_naive( conv2d_param = {
input2, filter, self.groups, conv2d_param, self.padding_algorithm, 'stride': self.stride,
self.data_format) 'pad': self.pad,
output = output.astype(self.dtype) 'dilation': self.dilations
}
self.inputs = {
'Input': XPUOpTest.np_dtype_to_fluid_dtype(input), np.random.seed(100)
'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter) input = np.random.random(self.input_size).astype(self.dtype)
} if not self.has_cuda():
self.attrs = { self.fuse_relu_before_depthwise_conv = False
'strides': self.stride, if self.fuse_relu_before_depthwise_conv:
'paddings': self.pad, input = input - 0.5
'padding_algorithm': self.padding_algorithm, input -= (input < 0) * 0.1
'groups': self.groups, input += (input >= 0) * 0.1
'dilations': self.dilations, input2 = np.maximum(input, 0.0)
'use_cudnn': self.use_cudnn, else:
'use_mkldnn': self.use_mkldnn, input2 = input
'data_format': self.data_format, np.random.seed(8)
'fuse_relu_before_depthwise_conv': filter = np.random.uniform(-1, 1,
self.fuse_relu_before_depthwise_conv, self.filter_size).astype(self.dtype)
'exhaustive_search': self.exhaustive_search output, _, _, _, _ = conv2d_forward_naive(
} input2, filter, self.groups, conv2d_param,
self.outputs = {'Output': output} self.padding_algorithm, self.data_format)
output = output.astype(self.dtype)
def has_cuda(self):
return core.is_compiled_with_cuda() and (self.use_cudnn or self.inputs = {
self.use_cuda) 'Input': XPUOpTest.np_dtype_to_fluid_dtype(input),
'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter)
def test_check_output(self): }
# TODO(wangzhongpu): support mkldnn op in dygraph mode self.attrs = {
if core.is_compiled_with_xpu(): 'strides': self.stride,
paddle.enable_static() 'paddings': self.pad,
place = paddle.XPUPlace(0) 'padding_algorithm': self.padding_algorithm,
self.check_output_with_place(place) 'groups': self.groups,
'dilations': self.dilations,
def test_check_grad(self): 'use_cudnn': self.use_cudnn,
# TODO(wangzhongpu): support mkldnn op in dygraph mode 'use_mkldnn': self.use_mkldnn,
if self.dtype == np.float16: 'data_format': self.data_format,
return 'fuse_relu_before_depthwise_conv':
if core.is_compiled_with_xpu(): self.fuse_relu_before_depthwise_conv,
paddle.enable_static() 'exhaustive_search': self.exhaustive_search
place = paddle.XPUPlace(0) }
self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output') self.outputs = {'Output': output}
def test_check_grad_no_filter(self): def has_cuda(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode return core.is_compiled_with_cuda() and (self.use_cudnn or
if self.dtype == np.float16: self.use_cuda)
return
if core.is_compiled_with_xpu(): def test_check_output(self):
paddle.enable_static() # TODO(wangzhongpu): support mkldnn op in dygraph mode
place = paddle.XPUPlace(0) if core.is_compiled_with_xpu():
self.check_grad_with_place( paddle.enable_static()
place, ['Input'], 'Output', no_grad_set=set(['Filter'])) self.check_output_with_place(place=self.place)
def test_check_grad_no_input(self): def test_check_grad(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode # TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16: if (hasattr(self, "no_need_check_grad") and
return self.no_need_check_grad == True):
if core.is_compiled_with_xpu(): return
paddle.enable_static() if core.is_compiled_with_xpu():
place = paddle.XPUPlace(0) paddle.enable_static()
self.check_grad_with_place( self.check_grad_with_place(self.place, {'Input', 'Filter'},
place, ['Filter'], 'Output', no_grad_set=set(['Input'])) 'Output')
def init_test_case(self): def test_check_grad_no_filter(self):
self.pad = [0, 0] # TODO(wangzhongpu): support mkldnn op in dygraph mode
self.stride = [1, 2] if (hasattr(self, "no_need_check_grad") and
self.input_size = [2, 3, 5, 5] # NCHW self.no_need_check_grad == True):
assert np.mod(self.input_size[1], self.groups) == 0 return
f_c = self.input_size[1] // self.groups if core.is_compiled_with_xpu():
self.filter_size = [6, f_c, 4, 3] paddle.enable_static()
self.check_grad_with_place(
def init_dilation(self): self.place, ['Input'],
self.dilations = [1, 1] 'Output',
no_grad_set=set(['Filter']))
def init_group(self):
self.groups = 1 def test_check_grad_no_input(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
def init_kernel_type(self): if (hasattr(self, "no_need_check_grad") and
pass self.no_need_check_grad == True):
return
def init_paddings(self): if core.is_compiled_with_xpu():
self.pad = [0, 0] paddle.enable_static()
self.padding_algorithm = "EXPLICIT" self.check_grad_with_place(
self.place, ['Filter'],
def init_data_format(self): 'Output',
self.data_format = "NCHW" no_grad_set=set(['Input']))
def init_test_case_2(self): def init_test_case(self):
pass self.pad = [0, 0]
self.stride = [1, 2]
self.input_size = [2, 3, 5, 5] # NCHW
class TestConv2DOp_AsyPadding(TestConv2DOp_v2): assert np.mod(self.input_size[1], self.groups) == 0
def init_paddings(self): f_c = self.input_size[1] // self.groups
self.pad = [0, 0, 0, 0] self.filter_size = [6, f_c, 4, 3]
self.padding_algorithm = "EXPLICIT"
def init_dilation(self):
self.dilations = [1, 1]
class TestWithPad_AsyPadding(TestConv2DOp_v2):
def init_test_case(self): def init_group(self):
self.stride = [1, 1] self.groups = 1
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0 def init_kernel_type(self):
f_c = self.input_size[1] // self.groups pass
self.filter_size = [6, f_c, 3, 3]
def init_paddings(self):
def init_paddings(self): self.pad = [0, 0]
self.pad = [1, 1, 1, 1] self.padding_algorithm = "EXPLICIT"
self.padding_algorithm = "EXPLICIT"
def init_data_format(self):
self.data_format = "NCHW"
class TestWithStride_AsyPadding(TestConv2DOp_v2):
def init_test_case(self): def init_test_case_2(self):
self.stride = [2, 2] pass
self.input_size = [2, 3, 6, 6] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0 class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
f_c = self.input_size[1] // self.groups def init_paddings(self):
self.filter_size = [6, f_c, 3, 3] self.pad = [0, 0, 0, 0]
self.padding_algorithm = "EXPLICIT"
def init_paddings(self):
self.pad = [1, 1, 1, 1] class TestWithPad_AsyPadding(TestConv2DOp_v2):
self.padding_algorithm = "EXPLICIT" def init_test_case(self):
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
def init_paddings(self):
self.pad = [1, 1, 1, 1]
self.padding_algorithm = "EXPLICIT"
class TestWithStride_AsyPadding(TestConv2DOp_v2):
def init_test_case(self):
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
def init_paddings(self):
self.pad = [1, 1, 1, 1]
self.padding_algorithm = "EXPLICIT"
support_types = get_xpu_op_support_types('conv2d')
for stype in support_types:
create_test_class(globals(), XPUTestConv2DOp, stype)
create_test_class(globals(), XPUTestConv2DOp_v2, stype)
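Conceptually, the create_test_class calls above clone every nested Test* class once per supported dtype and register the clone in the module namespace with in_type set, so unittest discovers one test class per (op, dtype) pair. A hypothetical sketch of that mechanism (helper name and details invented for illustration, not the actual Paddle helper):
def _register_dtype_tests(scope, wrapper_cls, dtype):
    for attr_name, attr in vars(wrapper_cls).items():
        if isinstance(attr, type):  # nested TestConv2DOp-style classes
            cls_name = "{}_{}".format(attr_name, dtype)
            scope[cls_name] = type(cls_name, (attr,), {"in_type": dtype})
class _DemoWrapper:
    class TestDemo:
        pass
_demo_scope = {}
_register_dtype_tests(_demo_scope, _DemoWrapper, "float32")
assert _demo_scope["TestDemo_float32"].in_type == "float32"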
#---------- test SAME VALID ----------- #---------- test SAME VALID -----------
#create_test_padding_SAME_class(TestConv2DOp_AsyPadding) #create_test_padding_SAME_class(TestConv2DOp_AsyPadding)
......
...@@ -52,4 +52,7 @@ loss = exe = fluid.Executor(cpu) ...@@ -52,4 +52,7 @@ loss = exe = fluid.Executor(cpu)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fluid.io.save_inference_model("./multi_fc_model", [a.name], [fc_out], exe) fluid.io.save_inference_model("./multi_fc_model", [a.name], [fc_out], exe)
fluid.io.save_inference_model("./multi_fc_model", [a.name], [fc_out], exe, None,
"fc.pdmodel", "fc.pdiparams")
print('output name', fc_out.name) print('output name', fc_out.name)
...@@ -49,7 +49,7 @@ all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_k ...@@ -49,7 +49,7 @@ all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_k
for ir in $all_ir_name for ir in $all_ir_name
do do
attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \ attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \
| awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \
gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
...@@ -62,7 +62,7 @@ all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_k ...@@ -62,7 +62,7 @@ all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_k
for ir in $all_ir_name for ir in $all_ir_name
do do
attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \ attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \
| awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \
gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
......
...@@ -133,11 +133,11 @@ namespace kernel { ...@@ -133,11 +133,11 @@ namespace kernel {
def gen_context(val): def gen_context(val):
if val == "CPU": if val == "CPU":
return "phi::CPUContext", "phi_cpu" return "::phi::CPUContext", "phi_cpu"
elif val == "GPU": elif val == "GPU":
return "phi::GPUContext", "phi_gpu" return "::phi::GPUContext", "phi_gpu"
# elif val == "XPU": # elif val == "XPU":
# return "phi::XPUContext", "phi_xpu" # return "::phi::XPUContext", "phi_xpu"
else: else:
# raise Exception(f"Unknown context type {val}") # raise Exception(f"Unknown context type {val}")
return "", "" return "", ""
...@@ -157,12 +157,12 @@ def gen_kernel_func(val, ctx_name, dtype_name): ...@@ -157,12 +157,12 @@ def gen_kernel_func(val, ctx_name, dtype_name):
ed = val.index('>') ed = val.index('>')
func_name = val[:st] func_name = val[:st]
template_name = val[st + 1:ed] template_name = val[st + 1:ed]
if 'phi::' in template_name: if '::phi::' in template_name:
return "&phi::" + val return "&::phi::" + val
else: else:
return "&phi::" + func_name + "<phi::" + template_name + ">" return "&::phi::" + func_name + "<::phi::" + template_name + ">"
else: else:
return "&phi::" + val + "<" + dtype_name + ", " + ctx_name + ">" return "&::phi::" + val + "<" + dtype_name + ", " + ctx_name + ">"
def gen_dtype(vals: List[str]): def gen_dtype(vals: List[str]):
...@@ -227,7 +227,7 @@ def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]): ...@@ -227,7 +227,7 @@ def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]):
return "" return ""
item[2] = gen_layout(item[2]) item[2] = gen_layout(item[2])
ir_dtypes, origin_dtypes = gen_dtype(item[4:-1]) ir_dtypes, origin_dtypes = gen_dtype(item[4:-1])
infer_shape_func = "&phi::" + item[-1] infer_shape_func = "&::phi::" + item[-1]
res = "" res = ""
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册