Unverified commit 883ee1a3 authored by W wanghuancoder and committed by GitHub

Merge branch 'develop' into revert-37926-eager_coreops_500

......@@ -75,6 +75,11 @@ class Carrier final {
bool IsInit() const;
// NOTE: This mutex will be used in the interceptor's RunOps function.
// This mutex is used to prevent forward ops and backward ops from running
// simultaneously, which can lead to a random hang for some sync ops.
std::mutex run;
DISABLE_COPY_AND_ASSIGN(Carrier);
private:
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
......@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
}
void ComputeInterceptor::RunOps() {
Carrier& carrier_instance = Carrier::Instance();
std::unique_lock<std::mutex> lock(carrier_instance.run);
VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
<< step_ + 1 << " time.";
for (auto op : node_->ops()) {
......
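The NOTE in carrier.h above explains why RunOps takes the Carrier::run mutex: forward and backward interceptors must not execute their ops at the same time. As a minimal standalone sketch of that locking pattern (it assumes nothing about Paddle's real Carrier or interceptor classes; FakeCarrier and the thread bodies below are illustrative stand-ins), two threads sharing one mutex serialize their loop bodies in the same way:

#include <iostream>
#include <mutex>
#include <thread>

struct FakeCarrier {
  std::mutex run;  // plays the role of Carrier::run above
};

void RunOps(FakeCarrier* carrier, const char* tag, int steps) {
  for (int step = 0; step < steps; ++step) {
    std::unique_lock<std::mutex> lock(carrier->run);
    // While the lock is held, the "forward" and "backward" bodies cannot
    // overlap, which is what avoids the random hang described above.
    std::cout << tag << " running ops for the " << step + 1 << " time.\n";
  }
}

int main() {
  FakeCarrier carrier;
  std::thread forward(RunOps, &carrier, "forward interceptor", 3);
  std::thread backward(RunOps, &carrier, "backward interceptor", 3);
  forward.join();
  backward.join();
  return 0;
}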
......@@ -116,6 +116,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
#ifdef PADDLE_WITH_IPU
void operator()(const paddle::platform::IPUPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::IPUPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
void operator()(const paddle::platform::NPUPinnedPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
......
......@@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
return device;
}
inline ::DLDevice operator()(const platform::IPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::IPUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::XPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
......
......@@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_ipu_place(place_)) {
#ifdef PADDLE_WITH_IPU
gc.reset(new IPUGarbageCollector(
BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -156,7 +156,7 @@ cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_test
cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass)
cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass)
cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass)
cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass)
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -338,3 +339,9 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
REGISTER_PASS(fc_elementwise_layernorm_fuse_pass,
paddle::framework::ir::FCElementwiseLayerNormFusePass);
REGISTER_PASS_CAPABILITY(fc_elementwise_layernorm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("fc", 0)
.LE("elementwise_add", 1)
.EQ("layer_norm", 0));
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void AvgShardPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter AvgShardPass::ApplyImpl";
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
if (ipu_backend->GetIpuStrategy()->need_avg_shard) {
VLOG(10) << "start AvgShardPass";
auto nodes = ir::TopologySortOperations(*graph);
auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus;
int shard_position = nodes.size() / num_ipus;
int index_and_stage = -1;
for (int i = 0; i < nodes.size(); i++) {
if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) {
index_and_stage++;
}
nodes[i]->Op()->SetAttr("ipu_index", index_and_stage);
nodes[i]->Op()->SetAttr("ipu_stage", index_and_stage);
}
VLOG(10) << "end AvgShardPass";
}
VLOG(10) << "leave AvgShardPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(avg_shard_pass, paddle::framework::ir::AvgShardPass);
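AvgShardPass above splits the topologically sorted op list into num_ipus roughly equal contiguous chunks and tags every op in chunk k with ipu_index == ipu_stage == k. Below is a standalone sketch of that index arithmetic only (the op and IPU counts are made up, and it assumes there are at least as many ops as IPUs):

#include <iostream>
#include <vector>

int main() {
  const int num_ops = 10;  // stand-in for nodes.size()
  const int num_ipus = 4;  // stand-in for IpuStrategy::num_ipus
  const int shard_position = num_ops / num_ipus;  // ops per IPU (floor)

  int index_and_stage = -1;
  std::vector<int> ipu_index(num_ops);
  for (int i = 0; i < num_ops; i++) {
    if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) {
      index_and_stage++;
    }
    ipu_index[i] = index_and_stage;  // the pass sets ipu_index/ipu_stage here
  }
  // Prints ops 0-1 -> IPU 0, 2-3 -> IPU 1, 4-5 -> IPU 2, 6-9 -> IPU 3:
  // the last IPU absorbs the remainder.
  for (int i = 0; i < num_ops; i++) {
    std::cout << "op " << i << " -> IPU " << ipu_index[i] << "\n";
  }
  return 0;
}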
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class AvgShardPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void ForwardGraphExtractPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter ForwardGraphExtractPass::ApplyImpl";
std::unordered_map<OpRole, std::unordered_set<ir::Node*>> all_ops{
{OpRole::kForward, {}}, {OpRole::kBackward, {}},
{OpRole::kOptimize, {}}, {OpRole::kRPC, {}},
{OpRole::kDist, {}}, {OpRole::kLRSched, {}},
{OpRole::kLoss, {}}, {OpRole::kNotSpecified, {}}};
for (auto* node : graph->Nodes()) {
if (!node->IsOp()) {
continue;
}
auto op_role = BOOST_GET_MUTABLE(int, node->Op()->GetAttr("op_role"));
if (op_role == static_cast<int>(OpRole::kForward)) {
all_ops[OpRole::kForward].insert(node);
} else if (op_role == static_cast<int>(OpRole::kBackward)) {
all_ops[OpRole::kBackward].insert(node);
} else if (op_role == static_cast<int>(OpRole::kOptimize)) {
all_ops[OpRole::kOptimize].insert(node);
} else if (op_role == static_cast<int>(OpRole::kRPC)) {
} else if (op_role == static_cast<int>(OpRole::kDist)) {
} else if (op_role == static_cast<int>(OpRole::kLRSched)) {
} else if (op_role == static_cast<int>(OpRole::kLoss)) {
all_ops[OpRole::kLoss].insert(node);
} else if (op_role == static_cast<int>(OpRole::kNotSpecified)) {
LOG(WARNING) << "Op: " << node->Name() << " OpRole is NotSpecified ";
}
}
std::unordered_set<ir::Node*> forward_vars;
std::unordered_set<ir::Node*> backward_vars;
std::unordered_set<ir::Node*> control_vars;
// forward_vars
for (auto& nodes : std::array<std::unordered_set<ir::Node*>, 2>{
all_ops[OpRole::kForward], all_ops[OpRole::kLoss]}) {
for (auto* node : nodes) {
for (auto* in_node : node->inputs) {
forward_vars.insert(in_node);
}
for (auto* out_node : node->outputs) {
forward_vars.insert(out_node);
}
}
}
// control_vars & backward_vars
for (auto* node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
if (node->IsCtrlVar()) {
control_vars.insert(node);
}
for (auto* in_node : node->inputs) {
if (all_ops[OpRole::kOptimize].count(in_node)) {
backward_vars.insert(node);
}
}
}
// all removed node
std::unordered_set<ir::Node*> rm_nodes;
for (auto* node : graph->Nodes()) {
if (backward_vars.count(node)) {
rm_nodes.insert(node);
} else if (control_vars.count(node)) {
rm_nodes.insert(node);
} else if (all_ops[OpRole::kBackward].count(node)) {
rm_nodes.insert(node);
} else if (all_ops[OpRole::kForward].count(node) == 0 &&
all_ops[OpRole::kLoss].count(node) == 0 &&
forward_vars.count(node) == 0) {
rm_nodes.insert(node);
} else if (node->Name() == "feed" || node->Name() == "fetch") {
rm_nodes.insert(node);
}
}
VLOG(10) << "Remove Node: ";
for (auto* node : rm_nodes) {
// remove node relations
for (auto* node_in : node->inputs) {
for (size_t i = 0; i < node_in->outputs.size(); ++i) {
if (node_in->outputs[i] == node) {
node_in->outputs.erase(node_in->outputs.begin() + i);
break;
}
}
}
for (auto* node_out : node->outputs) {
for (size_t i = 0; i < node_out->inputs.size(); ++i) {
if (node_out->inputs[i] == node) {
node_out->inputs.erase(node_out->inputs.begin() + i);
break;
}
}
}
VLOG(10) << "\t" << node->Name();
graph->RemoveNode(node);
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave ForwardGraphExtractPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(forward_graph_extract_pass,
paddle::framework::ir::ForwardGraphExtractPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class ForwardGraphExtractPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void InferShapePass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter InferShapePass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
auto batch_size = ipu_backend->GetIpuStrategy()->batch_size;
auto feed_list = Get<std::vector<std::string>>("feed_list");
for (auto node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
bool is_feed = std::find(feed_list.begin(), feed_list.end(),
node->Name()) != feed_list.end();
if (is_feed) {
auto input_shape = node->Var()->GetShape();
if (input_shape[0] <= -1) {
input_shape[0] = batch_size;
node->Var()->SetShape(input_shape);
}
// int64->int32
if (node->Var()->GetDataType() == proto::VarType::INT64) {
node->Var()->SetDataType(proto::VarType::INT32);
}
}
}
// temp scope for shape inference
std::shared_ptr<paddle::framework::Scope> scope(
new paddle::framework::Scope());
for (auto node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
auto var_desc = node->Var();
auto* ptr = scope->Var(var_desc->Name());
paddle::framework::InitializeVariable(ptr, var_desc->GetType());
auto tensor = ptr->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape()));
}
// infer shape
auto nodes = ir::TopologySortOperations(*graph);
for (auto node : nodes) {
auto op_desc = node->Op();
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope);
op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx);
for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) {
for (int i = 0; i < it->second.size(); i++) {
auto output_name = op_desc->Output(it->first)[i];
auto dim =
it->second[i]->GetMutable<paddle::framework::LoDTensor>()->dims();
auto new_shape = paddle::framework::vectorize(dim);
for (auto output_node : node->outputs) {
if (output_node->Name() == output_name) {
output_node->Var()->SetShape(new_shape);
}
}
}
}
}
// release the temp scope
scope.reset();
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave InferShapePass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(infer_shape_pass, paddle::framework::ir::InferShapePass)
.RequirePassAttr("feed_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferShapePass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
void InferencePostprocessPass::ApplyImpl(ir::Graph *graph) const {
VLOG(10) << "enter InferencePostprocessPass::ApplyImpl";
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
auto *feed_var = new paddle::framework::VarDesc("feed");
feed_var->SetType(proto::VarType::FEED_MINIBATCH);
auto *feed_var_node = graph->CreateVarNode(feed_var);
auto *fetch_var = new paddle::framework::VarDesc("fetch");
fetch_var->SetType(proto::VarType::FETCH_LIST);
auto *fetch_var_node = graph->CreateVarNode(fetch_var);
for (int i = 0; i < feed_list.size(); i++) {
for (auto node : graph->Nodes()) {
if (node->Name() == feed_list[i]) {
auto *op = new paddle::framework::OpDesc();
op->SetType("feed");
op->SetInput("X", {"feed"});
op->SetOutput("Out", {node->Name()});
op->SetAttr("col", i);
auto *op_node = graph->CreateOpNode(op);
node->inputs.push_back(op_node);
op_node->outputs.push_back(node);
feed_var_node->outputs.push_back(op_node);
op_node->inputs.push_back(feed_var_node);
break;
}
}
}
for (int i = 0; i < fetch_list.size(); i++) {
for (auto node : graph->Nodes()) {
if (node->Name() == fetch_list[i]) {
auto *op = new paddle::framework::OpDesc();
op->SetType("fetch");
op->SetInput("X", {node->Name()});
op->SetOutput("Out", {"fetch"});
op->SetAttr("col", i);
auto *op_node = graph->CreateOpNode(op);
node->outputs.push_back(op_node);
op_node->inputs.push_back(node);
fetch_var_node->inputs.push_back(op_node);
op_node->outputs.push_back(fetch_var_node);
break;
}
}
}
VLOG(10) << "leave InferencePostprocessPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inference_postprocess_pass,
paddle::framework::ir::InferencePostprocessPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferencePostprocessPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/inference_process_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter InferenceProcessPass::ApplyImpl";
// Get a new instance of ipu_backend
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetNewInstance();
// Set scope
auto& scope = graph->Get<Scope>(kParamScopeAttr);
ipu_backend->SetScope(scope);
// Set ipu_strategy
static std::shared_ptr<platform::ipu::IpuStrategy> ipu_strategy_instance_(
new platform::ipu::IpuStrategy());
ipu_strategy_instance_->is_training = false;
auto num_ipus = graph->Get<int>("num_ipus");
ipu_strategy_instance_->num_ipus = num_ipus;
if (num_ipus > 1) {
ipu_strategy_instance_->popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Manual;
} else {
ipu_strategy_instance_->popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Off;
}
auto enable_pipelining = graph->Get<bool>("enable_pipelining");
ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining;
if (enable_pipelining) {
auto batches_per_step = graph->Get<int>("batches_per_step");
PADDLE_ENFORCE_GE(
batches_per_step, num_ipus,
platform::errors::InvalidArgument("Batched per step should be equal or "
"greater than the number of IPUs"));
ipu_strategy_instance_->batches_per_step = batches_per_step;
}
ipu_strategy_instance_->batch_size = graph->Get<int>("batch_size");
ipu_strategy_instance_->need_avg_shard = graph->Get<bool>("need_avg_shard");
ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get()));
// Get feed_list and fetch list
std::vector<std::string> feed_list = {};
std::vector<std::string> fetch_list = {};
for (auto node : graph->Nodes()) {
if (node->Name() == "feed") {
if (node->IsOp()) {
feed_list.push_back("");
}
} else if (node->Name() == "fetch") {
if (node->IsOp()) {
fetch_list.push_back("");
}
}
}
for (auto node : graph->Nodes()) {
if (node->Name() == "feed") {
if (node->IsOp()) {
feed_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] =
node->outputs[0]->Name();
}
} else if (node->Name() == "fetch") {
if (node->IsOp()) {
fetch_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] =
node->inputs[0]->Name();
}
}
}
// Run passes
std::vector<std::string> graph_pass = {"forward_graph_extract_pass",
"infer_shape_pass", "avg_shard_pass",
"popart_canonicalization_pass"};
std::vector<std::string> compile_pass = {
"ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass",
"inference_postprocess_pass"};
for (auto pass_name : graph_pass) {
auto pass = PassRegistry::Instance().Get(pass_name);
if (pass_name == "infer_shape_pass") {
pass->Set("feed_list", new std::vector<std::string>(feed_list.begin(),
feed_list.end()));
}
pass->Apply(graph);
}
for (auto pass_name : compile_pass) {
auto pass = PassRegistry::Instance().Get(pass_name);
pass->Set("feed_list",
new std::vector<std::string>(feed_list.begin(), feed_list.end()));
pass->Set("fetch_list", new std::vector<std::string>(fetch_list.begin(),
fetch_list.end()));
pass->Apply(graph);
}
VLOG(10) << "leave InferenceProcessPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inference_process_pass,
paddle::framework::ir::InferenceProcessPass);
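InferenceProcessPass above recovers ordered feed and fetch lists in two phases: a first loop over the graph only counts feed/fetch ops to size the vectors, and a second loop places each variable name at the index given by the op's "col" attribute, so the lists come out in column order no matter how the graph is iterated. A standalone sketch of that two-phase fill (the op names and column indices below are made up):

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Stand-ins for feed ops discovered in arbitrary graph order: {col, name}.
  std::vector<std::pair<int, std::string>> feed_ops = {
      {2, "image"}, {0, "label"}, {1, "mask"}};

  std::vector<std::string> feed_list(feed_ops.size());  // phase 1: size it
  for (const auto& op : feed_ops) {
    feed_list[op.first] = op.second;  // phase 2: fill by the "col" index
  }
  for (size_t i = 0; i < feed_list.size(); ++i) {
    std::cout << "col " << i << ": " << feed_list[i] << "\n";
  }
  return 0;
}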
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferenceProcessPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuGraphBuilderPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
ipu_backend->Compile(graph, feed_list, fetch_list);
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuGraphBuilderPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_graph_builder_pass,
paddle::framework::ir::IpuGraphBuilderPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuGraphBuilderPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
std::string GenerateVarName(Node *node) {
return node->Name() + "_" + std::to_string(node->id());
}
void IpuInplacePass::ApplyImpl(ir::Graph *graph) const {
// Use this pass after forward_graph_extract_pass.
// Raise an error if an inplaced var appears in both feed_list and fetch_list.
VLOG(10) << "enter IpuInplacePass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
std::map<std::string, int> var_name;
for (auto *node : graph->Nodes()) {
if (node->IsVar()) {
if (var_name.find(node->Name()) == var_name.end()) {
var_name.emplace(node->Name(), 1);
} else {
var_name[node->Name()]++;
}
}
}
for (auto *node : graph->Nodes()) {
if (node->IsVar()) {
if (var_name[node->Name()] > 1) {
auto is_feed = (std::find(feed_list.begin(), feed_list.end(),
node->Name()) != feed_list.end()) &&
(node->inputs.size() == 0);
auto is_fetch = (std::find(fetch_list.begin(), fetch_list.end(),
node->Name()) != fetch_list.end()) &&
(node->outputs.size() == 0);
if (!is_feed && !is_fetch && !node->Var()->Persistable()) {
auto old_name = node->Name();
auto new_name = GenerateVarName(node);
node->RenameVar(new_name);
for (auto *op_in : node->inputs) {
op_in->Op()->RenameOutput(old_name, new_name);
}
for (auto *op_out : node->outputs) {
op_out->Op()->RenameInput(old_name, new_name);
}
}
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuInplacePass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_inplace_pass, paddle::framework::ir::IpuInplacePass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuInplacePass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
void IPUPassBase::Init(const std::string& repr, Graph* graph) const {
repr_ = repr;
graph_ = graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class IPUPassBase : public Pass {
public:
void Init(const std::string& repr, Graph* graph) const;
virtual ~IPUPassBase() {}
protected:
mutable Graph* graph_;
mutable std::string repr_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuRuntimeReplacerPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
framework::OpDesc ipu_rt_op_desc;
ipu_rt_op_desc.SetType("ipu_runtime");
ipu_rt_op_desc.SetInput("FeedList", feed_list);
ipu_rt_op_desc.SetOutput("FetchList", fetch_list);
ipu_rt_op_desc.Flush();
// Create a new node for the ipu_runtime_op.
auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc);
for (auto* node : graph->Nodes()) {
if (node->IsVar()) {
for (auto feed : feed_list) {
if (node->Name() == feed) {
IR_NODE_LINK_TO(node, ipu_rt_node);
}
}
for (auto fetch : fetch_list) {
if (node->Name() == fetch) {
IR_NODE_LINK_TO(ipu_rt_node, node);
}
}
}
}
// set ipu_runtime_op dtype attr
if (fetch_list.size() == 1) {
for (auto* node : graph->Nodes()) {
if (node->IsVar()) {
for (auto fetch : fetch_list) {
if (node->Name() == fetch) {
ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType());
}
}
}
}
}
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op_desc = node->Op();
if (op_desc->Type() != "ipu_runtime") {
marked_nodes.insert(node);
}
}
}
GraphSafeRemoveNodes(graph, marked_nodes);
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuRuntimeReplacerPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_runtime_replacer_pass,
paddle::framework::ir::IpuRuntimeReplacerPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuRuntimeReplacerPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance();
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()) {
int op_role = BOOST_GET_CONST(
int, node->Op()->GetAttr(
framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
// A graph usually has multiple optimizer nodes for different parameters,
// and these nodes usually have the same type and attr values.
if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) {
ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type());
VLOG(10) << "found optimizer type: " << node->Op()->Type();
for (const std::string& attr_name : node->Op()->AttrNames()) {
auto attr_type = node->Op()->GetAttrType(attr_name);
// with Adam, the attrs are floats
if (attr_type == proto::AttrType::FLOAT) {
auto attr_value =
BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name));
ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value);
} else {
VLOG(10) << "Skip " << attr_type;
}
}
auto lr_var_name = node->Op()->Input("LearningRate");
PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u,
platform::errors::InvalidArgument(
"In op(%s), find input(LearningRate) failed.",
node->Op()->Type()));
ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]);
}
if ((op_role == static_cast<int>(framework::OpRole::kLoss))) {
VLOG(10) << "found loss op type: " << node->Op()->Type();
auto outputs = node->Op()->Outputs();
PADDLE_ENFORCE_EQ(
outputs.size(), 1,
platform::errors::InvalidArgument("Can only support one loss key"));
auto losses_name = outputs.begin()->second;
PADDLE_ENFORCE_EQ(losses_name.size(), 1,
platform::errors::InvalidArgument(
"Can only support one loss name"));
ipu_backend->GetExecutor().SetLoss(losses_name[0]);
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(optimizer_extract_pass,
paddle::framework::ir::IpuOptimizerExtractPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuOptimizerExtractPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/common.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
using paddle::platform::ipu::IpuBackend;
using framework::ir::Graph;
using framework::ir::Node;
void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto ipu_backend = IpuBackend::GetInstance();
const auto* scope_ = ipu_backend->GetScope();
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()) {
int op_role = BOOST_GET_CONST(
int, node->Op()->GetAttr(
framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) {
auto inputs = node->Op()->Inputs();
if (inputs.count(platform::ipu::sBeta1Pow)) {
auto var = scope_->GetVar(inputs.at(platform::ipu::sBeta1Pow)[0]);
auto data = var->GetMutable<framework::LoDTensor>()->data<float>();
auto beta = BOOST_GET_CONST(
float, node->Op()->GetAttr(platform::ipu::sBeta1));
// Ensure the current save is beta1pow, rather than step.
// beta1pow = beta1 ^ (step + 1). Just set beta1pow because popart
// supports a single Step__
bool save_with_beta1pow = (data[0] < 1.0f) && (data[0] > 0.0f);
float step = 0;
float beta_acc = beta;
while (beta_acc > data[0] && save_with_beta1pow) {
beta_acc *= beta;
step += 1;
}
if (save_with_beta1pow) {
data[0] = step;
}
}
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuOptimizerStateAlignPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(optimizer_state_align_pass,
paddle::framework::ir::IpuOptimizerStateAlignPass);
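The alignment loop in IpuOptimizerStateAlignPass above converts a stored beta1pow value back into a step count by repeatedly multiplying by beta1, using the relation beta1pow = beta1 ^ (step + 1). A standalone numeric sketch of that recovery (the beta1 value and update count below are made up):

#include <iostream>

int main() {
  const float beta1 = 0.9f;
  // Pretend the optimizer state holds beta1pow after five updates,
  // i.e. beta1pow = beta1 ^ 5, so step + 1 == 5.
  float stored_beta1pow = 1.0f;
  for (int i = 0; i < 5; ++i) stored_beta1pow *= beta1;

  float step = 0;
  float beta_acc = beta1;
  while (beta_acc > stored_beta1pow) {  // same loop shape as the pass
    beta_acc *= beta1;
    step += 1;
  }
  std::cout << "recovered step = " << step << "\n";  // prints 4
  return 0;
}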
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
/*
 * This pass should only affect optimizers that need bias correction,
 * including Adam/Lamb.
*/
class IpuOptimizerStateAlignPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h"
namespace paddle {
namespace framework {
namespace ir {
using framework::ir::Graph;
using framework::ir::Node;
using platform::ipu::SymbolHandler;
void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto nodes = graph->Nodes();
for (auto* node : nodes) {
if (!node->IsOp()) {
continue;
}
auto* op = node->Op();
auto op_type = op->Type();
ir::Node* new_node = nullptr;
SymbolHandler handler = platform::ipu::GetHandler(op_type);
if (handler) {
VLOG(11) << "Raw Paddle Node:";
VLOG(11) << node->Op()->Proto()->DebugString();
new_node = handler(graph, node);
VLOG(11) << "Post Popart Node:";
VLOG(11) << new_node->Op()->Proto()->DebugString();
platform::ipu::ClearNode(node);
graph->RemoveNode(node);
} else {
LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type;
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(popart_canonicalization_pass,
paddle::framework::ir::PopartCanonicalizationPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class PopartCanonicalizationPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -84,13 +84,16 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse(
LOG(WARNING) << "Pass in op compat failed.";
return;
}
const int kNumFields = 5;
const int kTransOffset = 1;
const int kTransOutOffset = 2;
const int kFlattenOffset = 3;
const int kFlattenOutOffset = 4;
std::vector<Node *> nodes;
std::vector<Node *> nodes;
std::vector<int> trans_axis0;
int flatten_axis0;
for (int i = 0; i < times; i++) {
PADDLE_ENFORCE_NOT_NULL(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))),
......@@ -112,6 +115,33 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse(
platform::errors::NotFound("Can not find %s in subgraph.",
input_nodes[i]->name()));
if (i == 0) {
trans_axis0 = BOOST_GET_CONST(
std::vector<int>,
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(0)))
->Op()
->GetAttr("axis"));
flatten_axis0 = BOOST_GET_CONST(
int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0)))
->Op()
->GetAttr("axis"));
} else {
std::vector<int> trans_axis = BOOST_GET_CONST(
std::vector<int>,
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))
->Op()
->GetAttr("axis"));
// All axes of the transposes should be the same
if (trans_axis0 != trans_axis) return;
int flatten_axis = BOOST_GET_CONST(
int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0)))
->Op()
->GetAttr("axis"));
// All axes of the flattens should be the same
if (flatten_axis0 != flatten_axis) return;
}
nodes.push_back(subgraph.at(input_nodes[i]));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
......
......@@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, IPU, ::paddle::platform::IPUPlace, __VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
......
......@@ -2,7 +2,7 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper l
cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce)
cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn)
cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn)
cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn)
cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context)
if (WITH_TESTING)
cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn)
......
......@@ -29,55 +29,32 @@ namespace paddle {
namespace framework {
namespace paddle2cinn {
using GraphHashStrategy = CinnCacheKey::GraphHashStrategy;
CinnCacheKey::CinnCacheKey(GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {}
CinnCacheKey::CinnCacheKey(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str) {
const std::string& arch_str, GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {
this->SetKey(graph, input_tensors, arch_str);
}
CinnCacheKey::CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str) {
const std::string& arch_str,
GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {
this->SetKey(graph, input_shapes, arch_str);
}
size_t CinnCacheKey::HashGraph(const ir::Graph& graph) {
// using Dot to uniquely represent the graph
inference::analysis::Dot dot;
std::unordered_map<const ir::Node*, std::string> node2dot;
int id = 0;
// Create nodes
// graph.Nodes() returns an unordered_set, so the same graph may
// return a different result?
for (const ir::Node* n : graph.Nodes()) {
std::string node_id = std::to_string(id++);
dot.AddNode(node_id, {}, n->Name(), true);
node2dot[n] = node_id;
}
// Create edges
for (const ir::Node* n : graph.Nodes()) {
const auto& src_id = node2dot.at(n);
for (auto* out : n->outputs) {
const auto& dest_id = node2dot.at(out);
dot.AddEdge(src_id, dest_id, {});
}
}
const std::string& viz_graph = dot.Build();
VLOG(1) << "The hash graph:\n" << viz_graph;
size_t hash_val = std::hash<std::string>()(viz_graph);
VLOG(4) << "The graph's hash value is: " << hash_val;
return hash_val;
}
void CinnCacheKey::SetKey(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str) {
graph_serialize_str_ = std::to_string(HashGraph(graph));
graph_hash_val_ = graph_hash_(graph);
for (const auto& name_tensor : input_tensors) {
input_shapes_[name_tensor.first] = name_tensor.second->dims();
}
......@@ -87,7 +64,7 @@ void CinnCacheKey::SetKey(
void CinnCacheKey::SetKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str) {
graph_serialize_str_ = std::to_string(HashGraph(graph));
graph_hash_val_ = graph_hash_(graph);
input_shapes_ = input_shapes;
arch_str_ = arch_str;
}
......@@ -97,7 +74,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const {
}
bool CinnCacheKey::operator==(const CinnCacheKey& other) const {
return graph_serialize_str_ == other.graph_serialize_str_ &&
return graph_hash_val_ == other.graph_hash_val_ &&
input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_;
}
......@@ -114,11 +91,48 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const {
ret = hash_combine(ret, string_hasher(name_shape.second.to_str()));
}
ret = hash_combine(ret, string_hasher(key.graph_serialize_str_));
ret = hash_combine(ret, key.graph_hash_val_);
ret = hash_combine(ret, string_hasher(key.arch_str_));
return ret;
}
size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) {
// sort graph nodes by name and id.
auto compare = [](ir::Node* n1, ir::Node* n2) {
return (n1->Name() == n2->Name()) ? (n1->id() < n2->id())
: (n1->Name() < n2->Name());
};
// graph.Nodes() returns an unordered_set; an ordered set is used here so that
// the same graph does not produce different results
std::set<ir::Node *, bool (*)(ir::Node *, ir::Node *)> node_set(compare),
output_set(compare);
node_set.insert(graph.Nodes().begin(), graph.Nodes().end());
std::string hash_str;
for (ir::Node* n : node_set) {
hash_str.append(n->Name());
output_set.clear();
output_set.insert(n->outputs.begin(), n->outputs.end());
for (auto* out : output_set) {
hash_str.append(out->Name());
}
}
VLOG(1) << "The hash graph:\n" << hash_str;
size_t hash_val = std::hash<std::string>()(hash_str);
VLOG(4) << "The graph's hash value by graph structure is: " << hash_val;
return hash_val;
}
size_t CinnCacheKeyByAddress::HashGraph(const ir::Graph& graph) {
size_t hash_val = reinterpret_cast<size_t>(&graph);
VLOG(4) << "The graph's hash value by graph address is: " << hash_val;
return hash_val;
}
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
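CinnCacheKeyByStructure::HashGraph above makes the hash deterministic by visiting nodes in name/id order and appending each node's sorted output names, so the result no longer depends on unordered_set iteration order. A standalone sketch of that idea using a toy node type instead of ir::Node (ToyNode and the tiny x -> mul -> y graph below are made up for illustration):

#include <functional>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct ToyNode {
  std::string name;
  int id;
  std::vector<const ToyNode*> outputs;
};

int main() {
  ToyNode x{"x", 0, {}}, y{"y", 1, {}}, mul{"mul", 2, {}};
  mul.outputs = {&y};
  x.outputs = {&mul};

  auto compare = [](const ToyNode* a, const ToyNode* b) {
    return (a->name == b->name) ? (a->id < b->id) : (a->name < b->name);
  };
  std::set<const ToyNode*, decltype(compare)> node_set(compare);
  node_set.insert(&mul);
  node_set.insert(&x);
  node_set.insert(&y);

  std::string hash_str;
  for (const ToyNode* n : node_set) {
    hash_str.append(n->name);
    std::set<const ToyNode*, decltype(compare)> output_set(compare);
    output_set.insert(n->outputs.begin(), n->outputs.end());
    for (const ToyNode* out : output_set) hash_str.append(out->name);
  }
  std::cout << "hash string: " << hash_str << "\n";  // "mulyxmuly"
  std::cout << "hash value: " << std::hash<std::string>()(hash_str) << "\n";
  return 0;
}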
......@@ -14,6 +14,7 @@
#pragma once
#include <functional>
#include <map>
#include "paddle/fluid/framework/ddim.h"
......@@ -33,14 +34,18 @@ namespace paddle2cinn {
// shapes.
class CinnCacheKey {
public:
using GraphHashStrategy = std::function<size_t(const ir::Graph&)>;
explicit CinnCacheKey(GraphHashStrategy graph_hash);
CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str);
const std::string& arch_str, GraphHashStrategy graph_hash);
CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str);
const std::string& arch_str, GraphHashStrategy graph_hash);
~CinnCacheKey() {}
~CinnCacheKey() = default;
void SetKey(const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
......@@ -58,13 +63,38 @@ class CinnCacheKey {
};
private:
size_t HashGraph(const ir::Graph& graph);
std::string graph_serialize_str_;
GraphHashStrategy graph_hash_;
size_t graph_hash_val_;
std::map<std::string, DDim> input_shapes_;
std::string arch_str_;
};
#define CINN_CACHE_KEY_CREATE(NAME) \
class NAME : public CinnCacheKey { \
public: \
NAME() : CinnCacheKey(HashGraph) {} \
\
NAME(const ir::Graph& graph, \
const std::map<std::string, const LoDTensor*>& input_tensors, \
const std::string& arch_str) \
: CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {} \
\
NAME(const ir::Graph& graph, \
const std::map<std::string, DDim>& input_shapes, \
const std::string& arch_str) \
: CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {} \
\
private: \
static size_t HashGraph(const ir::Graph& graph); \
};
// Class to store the keys by graph address for compiling CINN.
CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress)
// Class to store the keys by graph structure for compiling CINN.
CINN_CACHE_KEY_CREATE(CinnCacheKeyByStructure)
#undef CINN_CACHE_KEY_CREATE
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
......@@ -26,8 +26,8 @@ namespace paddle {
namespace framework {
namespace paddle2cinn {
TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set;
TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) {
std::unordered_set<CinnCacheKeyByStructure, CinnCacheKey::Hash> test_set;
ProgramDesc empty_program;
ir::Graph empty_graph(empty_program);
......@@ -47,19 +47,20 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
DDim ddim = paddle::framework::make_ddim({1, 2, 3});
std::map<std::string, DDim> feed_shapes = {{"X", ddim}};
CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86");
CinnCacheKeyByStructure cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKeyByStructure cache_key1(empty_graph, feed_shapes, "x86");
EXPECT_EQ(cache_key0, cache_key1);
CinnCacheKey cache_key2(graph, feed_shapes, "x86");
CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu");
CinnCacheKeyByStructure cache_key2(graph, feed_shapes, "x86");
CinnCacheKeyByStructure cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKeyByStructure cache_key4(graph, feed_tensors, "nvgpu");
EXPECT_NE(cache_key2, cache_key3);
EXPECT_EQ(cache_key3, cache_key4);
CinnCacheKey cache_key5(empty_graph,
std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKey cache_key6(empty_graph, std::map<std::string, DDim>(), "unk");
CinnCacheKeyByStructure cache_key5(
empty_graph, std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKeyByStructure cache_key6(empty_graph, std::map<std::string, DDim>(),
"unk");
EXPECT_EQ(cache_key5, cache_key6);
EXPECT_NE(cache_key1, cache_key3);
......@@ -98,6 +99,107 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
EXPECT_EQ(test_set.find(cache_key6), test_set.end());
}
TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) {
std::unordered_set<CinnCacheKeyByAddress, CinnCacheKey::Hash> test_set;
ProgramDesc empty_program;
ir::Graph empty_graph(empty_program);
ProgramDesc program;
auto *global_block = program.MutableBlock(0);
auto *x = global_block->Var("X");
x->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph(program);
LoDTensor tensor;
tensor.Resize({1, 2, 3});
const LoDTensor *tensor_pointer = &tensor;
std::map<std::string, const LoDTensor *> feed_tensors = {
{"X", tensor_pointer}};
DDim ddim = paddle::framework::make_ddim({1, 2, 3});
std::map<std::string, DDim> feed_shapes = {{"X", ddim}};
CinnCacheKeyByAddress cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKeyByAddress cache_key1(empty_graph, feed_shapes, "x86");
EXPECT_EQ(cache_key0, cache_key1);
CinnCacheKeyByAddress cache_key2(graph, feed_shapes, "x86");
CinnCacheKeyByAddress cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKeyByAddress cache_key4(graph, feed_tensors, "nvgpu");
EXPECT_NE(cache_key2, cache_key3);
EXPECT_EQ(cache_key3, cache_key4);
CinnCacheKeyByAddress cache_key5(
empty_graph, std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKeyByAddress cache_key6(empty_graph, std::map<std::string, DDim>(),
"unk");
EXPECT_EQ(cache_key5, cache_key6);
EXPECT_NE(cache_key1, cache_key3);
EXPECT_NE(cache_key4, cache_key2);
EXPECT_NE(cache_key3, cache_key5);
EXPECT_NE(cache_key6, cache_key4);
EXPECT_NE(cache_key5, cache_key1);
EXPECT_NE(cache_key2, cache_key6);
test_set.insert(cache_key0);
test_set.insert(cache_key1);
test_set.insert(cache_key3);
test_set.insert(cache_key4);
test_set.insert(cache_key5);
test_set.insert(cache_key6);
EXPECT_EQ(test_set.size(), 3U);
auto iter = test_set.find(cache_key0);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 2U);
EXPECT_EQ(test_set.find(cache_key1), test_set.end());
iter = test_set.find(cache_key3);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 1U);
EXPECT_EQ(test_set.find(cache_key4), test_set.end());
iter = test_set.find(cache_key5);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 0U);
EXPECT_EQ(test_set.find(cache_key6), test_set.end());
}
TEST(CinnCacheKeyTest, TestSameGraph) {
ProgramDesc program1;
auto *global_block1 = program1.MutableBlock(0);
auto *x1 = global_block1->Var("X");
x1->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph1(program1);
ProgramDesc program2;
auto *global_block2 = program2.MutableBlock(0);
auto *x2 = global_block2->Var("X");
x2->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph2(program2);
LoDTensor tensor;
tensor.Resize({1, 2, 3});
const LoDTensor *tensor_pointer = &tensor;
std::map<std::string, const LoDTensor *> feed_tensors = {
{"X", tensor_pointer}};
CinnCacheKeyByAddress cache_key_by_address1(graph1, feed_tensors, "x86");
CinnCacheKeyByAddress cache_key_by_address2(graph2, feed_tensors, "x86");
EXPECT_NE(cache_key_by_address1, cache_key_by_address2);
CinnCacheKeyByStructure cache_key_by_struct1(graph1, feed_tensors, "x86");
CinnCacheKeyByStructure cache_key_by_struct2(graph2, feed_tensors, "x86");
EXPECT_EQ(cache_key_by_struct1, cache_key_by_struct2);
}
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
......@@ -41,6 +41,7 @@
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -68,23 +69,41 @@ const CinnCompiledObject& CinnCompiler::Compile(
const std::map<std::string, const LoDTensor*>& input_tensors,
const Target& target, void* stream) {
VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph);
CinnCacheKey cur_key(graph, input_tensors, target.arch_str());
CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors,
target.arch_str());
CinnCacheKeyByStructure cur_key_by_struct;
bool exist = false;
{
AutoRDLock r_guard{&rwlock_};
exist = cache_.count(cur_key) != 0;
exist = cache_by_address_.count(cur_key_by_address) != 0;
// If the graph cannot be found by its address, check whether the same
// graph structure has already been stored in the cache.
if (!exist) {
// generate the structure cache key
cur_key_by_struct.SetKey(graph, input_tensors, target.arch_str());
// If the graph structure is found, store the graph address in the
// cache to speed up the next query.
if (cache_by_struct_.count(cur_key_by_struct) != 0) {
exist = true;
cache_by_address_[cur_key_by_address] =
cache_by_struct_.at(cur_key_by_struct).get();
}
}
}
if (!exist) {
std::int64_t compiled_num = real_compiled_num_.fetch_add(1);
auto compiled_res =
CompileGraph(graph, input_tensors, target, compiled_num, stream);
AutoWRLock w_guard{&rwlock_};
if (!cache_.count(cur_key)) {
cache_[cur_key] = std::move(compiled_res);
if (!cache_by_struct_.count(cur_key_by_struct)) {
cache_by_address_[cur_key_by_address] = compiled_res.get();
cache_by_struct_[cur_key_by_struct] = std::move(compiled_res);
}
}
AutoRDLock guard{&rwlock_};
const auto& cached_boj = *cache_[cur_key];
const auto& cached_boj = *cache_by_address_[cur_key_by_address];
return cached_boj;
}
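The two-level lookup above (first by graph address, then by graph structure) can be summarized in isolation. The following is an illustrative sketch with placeholder types (AddrKey, StructKey, Compiled stand in for the real key and compiled-object classes), not the patched implementation:
#include <memory>
#include <unordered_map>
// Illustrative sketch only: the address cache holds non-owning pointers,
// while the structure cache owns the compiled objects.
template <typename AddrKey, typename StructKey, typename Compiled,
          typename Hash>
Compiled* LookupTwoLevel(
    const AddrKey& by_addr, const StructKey& by_struct,
    std::unordered_map<AddrKey, Compiled*, Hash>* addr_cache,
    std::unordered_map<StructKey, std::unique_ptr<Compiled>, Hash>*
        struct_cache) {
  auto it = addr_cache->find(by_addr);
  if (it != addr_cache->end()) return it->second;  // hit by graph address
  auto sit = struct_cache->find(by_struct);
  if (sit == struct_cache->end()) return nullptr;  // caller must compile
  // A structurally identical graph was compiled before: memoize this
  // address so the next query takes the fast path.
  (*addr_cache)[by_addr] = sit->second.get();
  return sit->second.get();
}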
......@@ -181,7 +200,8 @@ void CinnCompiler::Clear() {
{
AutoWRLock guard{&rwlock_};
graphs_.clear();
cache_.clear();
cache_by_address_.clear();
cache_by_struct_.clear();
}
real_compiled_num_.store(0);
}
......@@ -217,6 +237,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
*compiled_obj = {std::move(graph_compiler),
std::move(compiled_res.runtime_program), scope,
symbol.var_model_to_program_map()};
compiled_obj->launch_context =
std::make_unique<operators::details::CinnLaunchContext>(
compiled_obj->paddle2cinn_varmap, compiled_obj->scope);
return compiled_obj;
}
......
......@@ -31,6 +31,13 @@
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace operators {
namespace details {
class CinnLaunchContext;
} // namespace details
} // namespace operators
namespace framework {
namespace paddle2cinn {
......@@ -39,6 +46,7 @@ struct CinnCompiledObject {
std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
std::shared_ptr<::cinn::hlir::framework::Scope> scope;
std::unordered_map<std::string, std::string> paddle2cinn_varmap;
std::unique_ptr<operators::details::CinnLaunchContext> launch_context;
};
// Entrance to use CINN.
......@@ -87,9 +95,12 @@ class CinnCompiler {
void* stream = nullptr) const;
std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>,
std::unordered_map<CinnCacheKeyByAddress, CinnCompiledObject*,
CinnCacheKey::Hash>
cache_;
cache_by_address_;
std::unordered_map<CinnCacheKeyByStructure,
std::unique_ptr<CinnCompiledObject>, CinnCacheKey::Hash>
cache_by_struct_;
std::atomic_int64_t real_compiled_num_{0};
mutable RWLock rwlock_;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <sstream>
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -190,8 +191,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
}
KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
return KernelSignature(op_proto_->type(), GetInputArgsNames(),
GetAttrsArgsNames(), GetOutputArgsNames());
return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()),
GetInputArgsNames(), GetAttrsArgsNames(),
GetOutputArgsNames());
}
std::string KernelSignatureToString(const KernelSignature& signature) {
......
......@@ -76,6 +76,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) &&
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_ipu_place(src_place) &&
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -386,17 +402,33 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
}
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
......@@ -404,7 +436,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else { // NOLINT
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
......@@ -571,6 +604,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
platform::errors::Unimplemented("Not supported on place (%s) ", npu));
// return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::IPUPlace& ipu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", ipu));
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPinnedPlace& cpu) const {
......@@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> {
void VisitorImpl(const platform::XPUPlace& xpu) const {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
void VisitorImpl(const platform::IPUPlace& ipu) const {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
"is not supported in imperative mode",
place));
}
// There is NO support for gradient accumulation on IPUPlace.
void operator()(const platform::IPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
int64_t numel_;
......
......@@ -487,6 +487,14 @@ static void PreparedOpRunImpl(
op.Type(), outs, dev_ctx->GetPlace());
}
if (FLAGS_benchmark) {
dev_ctx->Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
#endif
}
/**
* [ Why do we need to convert complex gradients to real gradients? ]
*
......
......@@ -211,70 +211,6 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
}
#endif
// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now.
// If later the operators::StridedMemcpyWithAxis0 is supported,
// then this specific SplitTensorsForAllReduce can be removed.
#ifdef PADDLE_WITH_ASCEND_CL
template <>
void SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
const platform::NPUDeviceContext &context,
framework::Variable *p_dense_contents,
std::vector<framework::Tensor> *p_dense_tensors) {
auto *in = p_dense_contents->GetMutable<framework::LoDTensor>();
std::vector<framework::Tensor *> outs;
std::vector<const framework::Tensor *> shape_refer;
outs.reserve(p_dense_tensors->size());
shape_refer.reserve(p_dense_tensors->size());
for (auto &tensor : *p_dense_tensors) {
outs.emplace_back(&tensor);
shape_refer.emplace_back(&tensor);
}
operators::math::SplitFunctor<platform::NPUDeviceContext, float>
split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
}
template <>
void ConcatTensorsWithType<platform::NPUDeviceContext>(
const platform::NPUDeviceContext &context,
const std::vector<framework::Tensor> &dense_tensors_,
framework::Variable *p_dense_contents,
framework::proto::VarType::Type type) {
switch (type) {
case framework::proto::VarType::FP32:
ConcatTensorsForAllReduce<platform::NPUDeviceContext, float>(
context, dense_tensors_, p_dense_contents);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it concats tensors for "
"allreduce.",
framework::DataTypeToString(type)));
}
}
template <>
void SplitTensorsWithType<platform::NPUDeviceContext>(
const platform::NPUDeviceContext &context,
framework::Variable *p_dense_contents,
std::vector<framework::Tensor> *p_dense_tensors,
framework::proto::VarType::Type type) {
switch (type) {
case framework::proto::VarType::FP32:
SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
context, p_dense_contents, p_dense_tensors);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it splits tensors for "
"allreduce.",
framework::DataTypeToString(type)));
}
}
#endif
void Group::ConcatTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace();
if (platform::is_gpu_place(place)) {
......
......@@ -348,13 +348,14 @@ class AllocatorFacadePrivate {
const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
return iter->second->allocators_;
} else {
return allocators_;
......@@ -405,7 +406,7 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_);
cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
#endif
#if defined(PADDLE_WITH_CUDA)
......
......@@ -116,6 +116,34 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For Graphcore IPU
template <>
void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "IPUPlace, Allocate on cpu.";
void *p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
return p;
}
template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
uint64_t Release<platform::IPUPlace>(const platform::IPUPlace &place) {
return GetCPUBuddyAllocator()->Release();
}
template <>
size_t Used<platform::IPUPlace>(const platform::IPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
......
......@@ -32,9 +32,34 @@ CinnLaunchContext::CinnLaunchContext(
[](const auto& name_view) { return std::string(name_view.data()); });
}
bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) {
return paddle2cinn_varmap_.count(paddle_name) > 0 &&
cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0;
void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope,
const platform::Place& place) {
if (std::addressof(scope) == cached_scope_ &&
std::addressof(place) == cached_place_) {
VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_
<< " are not changed";
return;
}
cached_scope_ = std::addressof(scope);
cached_place_ = std::addressof(place);
cached_temp_scope_ = scope.NewTmpScope();
VLOG(4) << "Captured env is updated, scope:" << cached_scope_ << "->"
<< std::addressof(scope) << ", place:" << cached_place_ << "->"
<< std::addressof(place);
}
bool CinnLaunchContext::IsArgumentsInitialized() const {
if (hold_buffers_.empty() || name2argument_.empty()) {
return false;
}
return true;
}
bool CinnLaunchContext::IsVariableUsed(
const std::string& paddle_var_name) const {
return paddle2cinn_varmap_.count(paddle_var_name) > 0 &&
cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_var_name)) >
0;
}
CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& var_name) {
......@@ -53,99 +78,101 @@ std::unordered_set<std::string> CinnLaunchContext::GetInternalVariableNames() {
return all_parameters;
}
void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name,
const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor) {
void CinnLaunchContext::CheckTensorEquivalent(
const std::string& paddle_var_name, const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor) {
// check dimension
auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data());
PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims,
platform::errors::PreconditionNotMet(
"Tensors' shapes in variable(%s) are not equivalent, "
"paddle's shape = [%s], but cinn's shape = [%s].",
paddle_name, paddle_tensor.dims(), cinn_dims));
paddle_var_name, paddle_tensor.dims(), cinn_dims));
// TODO(CtfGo): check the underlying data type after CINN ready
}
void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name,
const platform::Place& place,
LoDTensor* paddle_tensor) {
PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true,
platform::errors::InvalidArgument(
"Paddle variable(%s) not used by cinn", paddle_name));
const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name);
CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
if (!paddle_tensor->IsInitialized()) {
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
void CinnLaunchContext::AssignExternalVariable(
const std::string& paddle_var_name) {
PADDLE_ENFORCE_EQ(
IsVariableUsed(paddle_var_name), true,
platform::errors::InvalidArgument("Paddle variable(%s) not used by cinn",
paddle_var_name));
const auto& cinn_var_name = paddle2cinn_varmap_.at(paddle_var_name);
const auto& paddle_tensor =
cached_scope_->GetVar(paddle_var_name)->Get<LoDTensor>();
CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name);
if (paddle_tensor.IsInitialized()) {
CheckTensorEquivalent(paddle_var_name, paddle_tensor, cinn_tensor);
}
CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor);
return SetArgument(cinn_name, place, /* free_mem_callback = */ false,
paddle_tensor);
}
void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name,
const platform::Place& place,
LoDTensor* paddle_tensor) {
PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0,
platform::errors::InvalidArgument(
"Variable(%s) not found in cinn scope.", cinn_name));
CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
if (!paddle_tensor->IsInitialized()) {
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
}
CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor);
return SetArgument(cinn_name, place, /* free_mem_callback = */ true,
paddle_tensor);
}
auto cinn_buffer = std::make_unique<cinn_buffer_t>();
// assign dimensions and alloc/free callback of cinn_buffer_t
cinn_buffer->resize(cinn_tensor->shape().data().data(),
cinn_tensor->shape().data().size());
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
[this, paddle_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_scope_->GetVar(paddle_var_name)->GetMutable<LoDTensor>();
tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
buffer->memory = reinterpret_cast<uint8_t*>(
tensor->mutable_data<float>(*cached_place_));
return 0;
});
std::unique_ptr<cinn_buffer_t> CinnLaunchContext::ShareTensorWithCinnBuffer(
const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) {
// convert paddle dimensions array to cinn format
std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size());
for (auto i = 0; i < tensor->dims().size(); ++i) {
cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i));
}
// external variables will be recycled by global gc, so do nothing here
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[](void* ctx, cinn_buffer_t* buffer) {
// Do nothing
return 0;
});
return SetArgument(cinn_var_name, std::move(cinn_buffer));
}
void CinnLaunchContext::AssignInternalVariable(
const std::string& cinn_var_name) {
PADDLE_ENFORCE_GT(
cinn_variable_names_.count(cinn_var_name), 0,
platform::errors::InvalidArgument("Variable(%s) not found in cinn scope.",
cinn_var_name));
CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name);
auto cinn_buffer = std::make_unique<cinn_buffer_t>();
// assign size and memory
cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());
// assign dimensions and alloc/free callback of cinn_buffer_t
cinn_buffer->resize(cinn_tensor->shape().data().data(),
cinn_tensor->shape().data().size());
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
[place, tensor](void* ctx, cinn_buffer_t* buffer) {
buffer->memory =
reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place));
[this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_temp_scope_->Var(cinn_var_name)->GetMutable<LoDTensor>();
tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
buffer->memory = reinterpret_cast<uint8_t*>(
tensor->mutable_data<float>(*cached_place_));
return 0;
});
if (free_mem_callback) {
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[tensor](void* ctx, cinn_buffer_t* buffer) {
tensor->clear();
return 0;
});
return cinn_buffer;
}
// Internal variables should release their buffers immediately
// once no instruction uses them.
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[](void* ctx, cinn_buffer_t* buffer) {
// Do nothing
[this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_temp_scope_->GetVar(cinn_var_name)->GetMutable<LoDTensor>();
tensor->clear();
return 0;
});
return cinn_buffer;
return SetArgument(cinn_var_name, std::move(cinn_buffer));
}
void CinnLaunchContext::SetArgument(const std::string& cinn_name,
const platform::Place& place,
bool free_mem_callback,
LoDTensor* paddle_tensor) {
auto buffer =
ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor);
name2argument_.emplace(cinn_name, buffer.get());
void CinnLaunchContext::SetArgument(const std::string& cinn_var_name,
std::unique_ptr<cinn_buffer_t>&& buffer) {
VLOG(4) << "SetArgument-" << name2argument_.size() << ": name("
<< cinn_var_name << "), dims("
<< framework::DDim(buffer->dims, buffer->dimensions) << ").";
name2argument_.emplace(cinn_var_name, buffer.get());
hold_buffers_.emplace_back(std::move(buffer));
VLOG(4) << "SetArgument-" << name2argument_.size() << ": "
<< "name(" << cinn_name << "), dims(" << paddle_tensor->dims()
<< ").";
}
const std::map<std::string, cinn_pod_value_t>&
......
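The bookkeeping in SetArgument above follows a common ownership split: name2argument_ stores non-owning raw pointers handed to the CINN runtime, while hold_buffers_ keeps the unique_ptrs alive for the lifetime of the context. A stand-alone sketch of that pattern (the names below are illustrative, not the real members):
#include <map>
#include <memory>
#include <string>
#include <vector>
struct Buffer {};  // stand-in for cinn_buffer_t
class ArgumentHolder {
 public:
  void Set(const std::string& name, std::unique_ptr<Buffer>&& buffer) {
    args_.emplace(name, buffer.get());       // non-owning view for callers
    owned_.emplace_back(std::move(buffer));  // ownership stays here
  }
  const std::map<std::string, Buffer*>& Arguments() const { return args_; }
 private:
  std::map<std::string, Buffer*> args_;
  std::vector<std::unique_ptr<Buffer>> owned_;
};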
......@@ -24,7 +24,7 @@
#include "cinn/runtime/cinn_runtime.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace operators {
......@@ -40,16 +40,22 @@ class CinnLaunchContext {
const std::unordered_map<std::string, std::string>& paddle2cinn_varmap,
const std::shared_ptr<CinnScope>& cinn_scope);
// Explicitly update the environment (scope and place) captured
// by the callbacks of the execution arguments.
void UpdateCapturedEnv(const framework::Scope& scope,
const platform::Place& place);
// Return whether the execution arguments have been initialized
bool IsArgumentsInitialized() const;
// Return whether a Paddle variable is used by the compiled kernels
bool IsVariableUsed(const std::string& var_name);
bool IsVariableUsed(const std::string& paddle_var_name) const;
// Assign tensor buffer to input or output variables
void AssignExternalVariable(const std::string& var_name,
const platform::Place& place, LoDTensor* tensor);
void AssignExternalVariable(const std::string& paddle_var_name);
// Assign tensor buffer to internal variables
void AssignInternalVariable(const std::string& var_name,
const platform::Place& place, LoDTensor* tensor);
void AssignInternalVariable(const std::string& cinn_var_name);
// Extract internal variable names from CinnScope
// by excluding used input and output variables
......@@ -58,10 +64,6 @@ class CinnLaunchContext {
// Finalize all execution arguments and return them
const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const;
std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() {
return std::move(hold_buffers_);
}
private:
// Get CinnTensor with CINN variable name
CinnTensor GetCinnTensor(const std::string& var_name);
......@@ -72,16 +74,15 @@ class CinnLaunchContext {
const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor);
// Share the buffer of a Paddle tensor to CINN by delivering memory address
// to a cinn_buffer_t object
std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(
const platform::Place& place, bool free_mem_callback, LoDTensor* tensor);
// Set an argument with (cinn name)->(paddle tensor) pair
void SetArgument(const std::string& cinn_name, const platform::Place& place,
bool free_mem_callback, LoDTensor* paddle_tensor);
// Set an argument with (cinn name)->(cinn_buffer_t) pair
void SetArgument(const std::string& cinn_var_name,
std::unique_ptr<cinn_buffer_t>&& buffer);
private:
const framework::Scope* cached_scope_ = nullptr;
const platform::Place* cached_place_ = nullptr;
std::unique_ptr<framework::Scope> cached_temp_scope_ = nullptr;
// a variable name map from paddle to cinn
const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_;
// the variable scope of cinn
......
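Taken together, the interface above is meant to be driven in a fixed order. A minimal usage sketch, mirroring CinnLaunchOpKernel further below (launch_context, scope, place and the hypothetical all_io_names list are assumed to exist in the caller):
// Illustrative call order only; error handling omitted.
launch_context->UpdateCapturedEnv(scope, place);
if (!launch_context->IsArgumentsInitialized()) {
  for (const auto& name : all_io_names) {  // hypothetical input/output names
    if (launch_context->IsVariableUsed(name)) {
      launch_context->AssignExternalVariable(name);
    }
  }
  for (const auto& name : launch_context->GetInternalVariableNames()) {
    launch_context->AssignInternalVariable(name);
  }
}
const auto& args = launch_context->FinalizeArguments();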
......@@ -45,81 +45,86 @@ std::unique_ptr<CinnLaunchContext> CreateDefaultLaunchContext() {
return std::make_unique<CinnLaunchContext>(paddle2cinn_varmap, cinn_scope);
}
TEST(CinnLaunchContextTest, TestIsVariableUsed) {
TEST(CinnLaunchContextTest, TestBasic) {
auto launch_context = CreateDefaultLaunchContext();
// test IsVariableUsed
ASSERT_EQ(launch_context->IsVariableUsed("var1"), true);
ASSERT_EQ(launch_context->IsVariableUsed("var4"), false);
}
TEST(CinnLaunchContextTest, TestGetInternalVariableNames) {
auto launch_context = CreateDefaultLaunchContext();
auto internal_variable_names = launch_context->GetInternalVariableNames();
ASSERT_EQ(internal_variable_names.size(), 3);
EXPECT_NE(internal_variable_names.find("cinn_var2"),
internal_variable_names.end());
// test UpdateCapturedEnv
platform::CPUPlace place;
framework::Scope scope;
ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place));
// test IsArgumentsInitialized
ASSERT_FALSE(launch_context->IsArgumentsInitialized());
}
TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) {
auto launch_context = CreateDefaultLaunchContext();
platform::CPUPlace place;
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
// CheckTensorEquivalent: tensor dimension not equivalent
tensor1->mutable_data<float>(framework::make_ddim({3, 5}), place);
ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1),
ASSERT_THROW(launch_context->AssignExternalVariable("var1"),
paddle::platform::EnforceNotMet);
}
TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) {
auto launch_context = CreateDefaultLaunchContext();
platform::CPUPlace place;
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
auto* tensor4 = scope.Var("var4")->GetMutable<LoDTensor>();
// not used
ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4),
ASSERT_THROW(launch_context->AssignExternalVariable("var4"),
paddle::platform::EnforceNotMet);
// not found
ASSERT_THROW(
launch_context->AssignExternalVariable("cinn_var4", place, tensor4),
paddle::platform::EnforceNotMet);
ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"),
paddle::platform::EnforceNotMet);
}
TEST(CinnLaunchContextTest, TestSetArgument) {
platform::CPUPlace cpu_place;
platform::Place place(cpu_place);
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
platform::CPUPlace place;
framework::Scope scope;
// assign external variables
auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
float* data1 =
tensor1->mutable_data<float>(framework::make_ddim({3, 4}), place);
data1[0] = 9.99f;
data1[10] = 19.99f;
ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1"));
// assign external variable
ASSERT_NO_THROW(
launch_context->AssignExternalVariable("var1", place, tensor1));
auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
ASSERT_NO_THROW(
launch_context->AssignInternalVariable("cinn_var2", place, tensor2));
// FinalizeArguments not missed check
ASSERT_THROW(launch_context->FinalizeArguments(),
paddle::platform::EnforceNotMet);
auto* tensor3 = scope.Var("var3")->GetMutable<LoDTensor>();
tensor3->mutable_data<float>(framework::make_ddim({10, 16}), place);
ASSERT_NO_THROW(
launch_context->AssignExternalVariable("var3", place, tensor3));
ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3"));
// FinalizeArguments missed check
ASSERT_THROW(launch_context->FinalizeArguments(),
paddle::platform::EnforceNotMet);
// test get internal variables
auto internal_variable_names = launch_context->GetInternalVariableNames();
ASSERT_EQ(internal_variable_names.size(), 1);
EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2");
auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2"));
// check argument is set correctly and alloc/free callbacks work well
auto name2argument = launch_context->FinalizeArguments();
ASSERT_EQ(name2argument.size(), 3);
ASSERT_EQ(name2argument.count("cinn_var1"), 1);
// check ShareTensorWithCinnBuffer
ASSERT_TRUE(launch_context->IsArgumentsInitialized());
auto* cinn_buffer =
static_cast<cinn_buffer_t*>(name2argument.at("cinn_var1"));
ASSERT_EQ(cinn_buffer->memory, nullptr);
cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer);
ASSERT_NE(cinn_buffer->memory, nullptr);
......
......@@ -31,26 +31,6 @@ namespace operators {
namespace details {
#ifdef PADDLE_WITH_CUDA
void CUDART_CB ReleaseScope(void* data) {
auto* temp_scope = static_cast<framework::Scope*>(data);
delete temp_scope;
}
void CUDART_CB ReleaseBuffers(void* data) {
auto* buffers =
static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data);
delete buffers;
}
template <>
void ReleaseResource<platform::CUDADeviceContext>(
const std::vector<void*>& resources, void* stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
static_cast<gpuStream_t>(stream), ReleaseScope, resources[0]));
PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
static_cast<gpuStream_t>(stream), ReleaseBuffers, resources[1]));
}
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx) {
......
......@@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
// Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
void SetCinnRuntimeFlags();
template <typename DeviceContext>
void ReleaseResource(const std::vector<void*>& resources, void* stream) {
auto* temp_scope = static_cast<framework::Scope*>(resources[0]);
auto* buffers =
static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]);
delete temp_scope;
delete buffers;
}
template <typename DeviceContext>
void* GetStream(const framework::ExecutionContext& ctx) {
return nullptr;
}
#ifdef PADDLE_WITH_CUDA
template <>
void ReleaseResource<platform::CUDADeviceContext>(
const std::vector<void*>& resources, void* stream);
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx);
......@@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
compilation_key, inputs_name2tensor, target, stream);
details::DebugCinnCompiledResult(cinn_compiled_object);
auto launch_context = std::make_unique<details::CinnLaunchContext>(
cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope);
auto* launch_context = cinn_compiled_object.launch_context.get();
// Step 3. Prepare arguments needed for the compiled executable program.
VLOG(4) << "CinnLaunchOp prepare arguments";
// 3.1 Prepare input variables: tensors of the input variables have
// been initialized before the graph was compiled, so just check the
// equality between the paddle tensors and the cinn tensors.
for (const auto& var_name : input_variable_names) {
if (!launch_context->IsVariableUsed(var_name)) {
// Some input variables are not needed by cinn because they are
// eliminated by optimization passes or because some cinn operators
// use fewer variables.
VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
continue;
launch_context->UpdateCapturedEnv(scope, place);
if (!launch_context->IsArgumentsInitialized()) {
VLOG(4) << "CinnLaunchOp prepare arguments";
// 3.1 Prepare input variables: tensors of the input variables have
// been initialized before the graph was compiled, so just check the
// equality between the paddle tensors and the cinn tensors.
for (const auto& var_name : input_variable_names) {
if (!launch_context->IsVariableUsed(var_name)) {
// Some input variables are not needed by cinn because they are
// eliminated by optimization passes or because some cinn operators
// use fewer variables.
VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
continue;
}
launch_context->AssignExternalVariable(var_name);
}
launch_context->AssignExternalVariable(
var_name, place, scope.GetVar(var_name)->GetMutable<LoDTensor>());
}
// 3.2 Prepare output variables: all output variables should
// be initialized and have their buffers allocated before
// the runtime program starts execution. The compilation result
// includes details of their buffer assignment, and we use that to
// allocate space in Paddle. For variables that are already allocated,
// like persistable parameters, just check the equality between the
// Paddle allocation and the CINN buffer assignment.
auto output_variable_names = ctx.OutputNames(kOutputs);
for (const auto var_name : output_variable_names) {
PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
platform::errors::InvalidArgument(
"Output variable(%s) not used by cinn", var_name));
auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
launch_context->AssignExternalVariable(var_name, place, tensor);
}
// 3.2 Prepare output variables: all output variables should
// be initialized and have their buffers allocated before
// the runtime program starts execution. The compilation result
// includes details of their buffer assignment, and we use that to
// allocate space in Paddle. For variables that are already allocated,
// like persistable parameters, just check the equality between the
// Paddle allocation and the CINN buffer assignment.
auto output_variable_names = ctx.OutputNames(kOutputs);
for (const auto var_name : output_variable_names) {
PADDLE_ENFORCE_EQ(
launch_context->IsVariableUsed(var_name), true,
platform::errors::InvalidArgument(
"Output variable(%s) not used by cinn", var_name));
launch_context->AssignExternalVariable(var_name);
}
// 3.3 Prepare internal or temporary variables: create a temporary
// scope to keep the internal variables within the graph, plus any
// temporary variables needed by the compiled runtime program.
// Here we directly use the names from CinnScope as Paddle variable
// names, because they will not be used outside the graph
// and should be destroyed after the computation finishes.
auto internal_variable_names = launch_context->GetInternalVariableNames();
framework::Scope* temp_scope = scope.NewTmpScope().release();
for (const auto& var_name : internal_variable_names) {
auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
launch_context->AssignInternalVariable(var_name, place, tensor);
// 3.3 Prepare internal or temporary variables: create a temporary
// scope to keep the internal variables within the graph, plus any
// temporary variables needed by the compiled runtime program.
// Here we directly use the names from CinnScope as Paddle variable
// names, because they will not be used outside the graph
// and should be destroyed after the computation finishes.
auto internal_variable_names = launch_context->GetInternalVariableNames();
for (const auto& var_name : internal_variable_names) {
launch_context->AssignInternalVariable(var_name);
}
}
// Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
......@@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
VLOG(4) << "CinnLaunchOp launch execution done.";
// Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{
launch_context->HandoverBuffers()};
details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder},
stream);
}
};
......
......@@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) {
scope.Var(test_out_name)->GetMutable<LoDTensor>();
scope.Var(expected_out_name)->GetMutable<LoDTensor>();
cinn_launch_op->Run(scope, place);
elementwise_add_op->Run(scope, place);
platform::Place run_place(place);
cinn_launch_op->Run(scope, run_place);
elementwise_add_op->Run(scope, run_place);
LoDTensor test_out, expected_out;
TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(),
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/complex_view_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
class AsComplexOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_complex");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_complex");
auto in_dims = ctx->GetInputDim("X");
const int input_rank = in_dims.size();
PADDLE_ENFORCE_GE(
input_rank, 1,
platform::errors::InvalidArgument(
"The rank of input(X) is less than 1. "
"Expected the rank of input(X) to be equal to or greater than 1. "
"But received rank of input(X) = %d",
input_rank));
const int last_dim_size = in_dims[input_rank - 1];
PADDLE_ENFORCE_EQ(
last_dim_size, 2,
platform::errors::InvalidArgument(
"The size of the last dimension of input(X) "
"does not equal 2. "
"Expected the size of the last dimension of input(X) to be 2. "
"But received %d",
last_dim_size));
const framework::DDim out_dims(in_dims.Get(), input_rank - 1);
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class AsComplexOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The input tensor of view_as_complex op.");
AddOutput("Out", "(Tensor), The output tensor of view_as_complex op.");
AddComment(R"DOC(
As_complex Operator.
This operator is used to return a complex tensor represented by a real
tensor whose last dimension has size 2; the two elements of that dimension
correspond to the 'real' and 'imaginary' parts, respectively.
)DOC");
}
};
template <typename T>
class AsComplexGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("as_real");
retv->SetInput("X", this->OutputGrad("Out"));
retv->SetAttrMap(this->Attrs());
retv->SetOutput("Out", this->InputGrad("X"));
}
};
class AsRealOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_real");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_real");
auto out_dims_v = framework::vectorize(ctx->GetInputDim("X"));
out_dims_v.push_back(2);
const framework::DDim out_dims = framework::make_ddim(out_dims_v);
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(framework::ToRealType(input_data_type),
ctx.GetPlace());
}
};
class AsRealOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The input tensor of as_real op.");
AddOutput("Out", "(Tensor), The output tensor of as_real op.");
AddComment(R"DOC(
AsReal Operator.
This operator is used to return a real tensor viewed from a complex tensor.
The size of the last dimension of the output tensor is 2, and the two
elements correspond to the 'real' and 'imaginary' parts, respectively.
)DOC");
}
};
template <typename T>
class AsRealGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("as_complex");
retv->SetInput("X", this->OutputGrad("Out"));
retv->SetAttrMap(this->Attrs());
retv->SetOutput("Out", this->InputGrad("X"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(as_complex, ops::AsComplexOp, ops::AsComplexOpMaker,
ops::AsComplexGradMaker<paddle::framework::OpDesc>,
ops::AsComplexGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(as_real, ops::AsRealOp, ops::AsRealOpMaker,
ops::AsRealGradMaker<paddle::framework::OpDesc>,
ops::AsRealGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
as_complex, ops::AsComplexKernel<paddle::platform::CPUDeviceContext, float>,
ops::AsComplexKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
as_real, ops::AsRealKernel<paddle::platform::CPUDeviceContext, float>,
ops::AsRealKernel<paddle::platform::CPUDeviceContext, double>);
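The net effect of the two InferShape functions above is a pure view change on shape and dtype; expressed with the same DDim helpers (illustrative values only):
// as_complex: float tensor [3, 4, 2]  ->  complex<float> tensor [3, 4]
framework::DDim in = framework::make_ddim({3, 4, 2});
framework::DDim complex_dims(in.Get(), in.size() - 1);          // {3, 4}
// as_real: complex<float> tensor [3, 4]  ->  float tensor [3, 4, 2]
auto real_dims_v = framework::vectorize(complex_dims);
real_dims_v.push_back(2);
framework::DDim real_dims = framework::make_ddim(real_dims_v);  // {3, 4, 2}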
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/complex_view_op.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/enforce.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
as_complex,
ops::AsComplexKernel<paddle::platform::CUDADeviceContext, float>,
ops::AsComplexKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
as_real, ops::AsRealKernel<paddle::platform::CUDADeviceContext, float>,
ops::AsRealKernel<paddle::platform::CUDADeviceContext, double>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AsComplexKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::LoDTensor>("X");
auto* out = context.Output<framework::LoDTensor>("Out");
out->mutable_data<platform::complex<T>>(context.GetPlace());
// TensorCopy also changes output's shape & dtype
const framework::DDim out_dims_original = out->dims();
framework::TensorCopy(*x, context.GetPlace(), out);
out->Resize(out_dims_original); // restore the shape
out->mutable_data<platform::complex<T>>(
context.GetPlace()); // restore the dtype
}
};
template <typename DeviceContext, typename T>
class AsRealKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::LoDTensor>("X");
auto* out = context.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(context.GetPlace());
const framework::DDim out_dims_original = out->dims();
framework::TensorCopy(*x, context.GetPlace(), out);
out->Resize(out_dims_original); // restore the shape
out->mutable_data<T>(context.GetPlace()); // restore the dtype
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/ipu_runtime_op.h"
namespace paddle {
namespace operators {
class IpuRuntimeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.device_context());
}
};
class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("FeedList", "FeedList of Graph").AsDuplicable();
AddOutput("FetchList", "FetchList of Graph").AsDuplicable();
AddAttr<int>("dtype",
"(int, default 5 (FP32)) "
"Output data type")
.SetDefault(framework::proto::VarType::FP32);
AddComment(R"DOC(
Run graph by PopART runtime.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker);
REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel<float>,
ops::IpuRuntimeKernel<double>,
ops::IpuRuntimeKernel<int>,
ops::IpuRuntimeKernel<int64_t>,
ops::IpuRuntimeKernel<bool>,
ops::IpuRuntimeKernel<int8_t>,
ops::IpuRuntimeKernel<paddle::platform::float16>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/framework/ipu/ipu_backend.h"
#include "paddle/fluid/framework/tensor.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class IpuRuntimeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#ifdef PADDLE_WITH_IPU
auto ipu_backend = framework::ipu::IpuBackend::GetInstance();
if (!ipu_backend->DeviceIsAttached()) {
const platform::IPUDeviceContext& ipu_ctx =
reinterpret_cast<const platform::IPUDeviceContext&>(
ctx.device_context());
ipu_backend->AttachDevice(ipu_ctx.DeviceId());
}
auto inputs = ctx.MultiInput<framework::Tensor>("FeedList");
auto outputs = ctx.MultiOutput<framework::Tensor>("FetchList");
auto output_names = ctx.OutputNames("FetchList");
VLOG(4) << "IpuRuntime Kernel, begin to run graph";
ipu_backend->Run(inputs, outputs, ctx);
// post-run
// resize tensor when tensor.dims() is empty
for (size_t i = 0; i < outputs.size(); ++i) {
auto* out = outputs[i];
if (out->dims().size() == 0) {
auto tensor_dtype = out->type();
auto sizeof_dtype = framework::SizeOfType(tensor_dtype);
int64_t dim = out->memory_size() / sizeof_dtype;
out->Resize({dim});
VLOG(10) << "set ipu_runtime_op output: " << output_names[i]
<< " dims from () to: "
<< "(" << dim << ")";
}
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Please compile with the WITH_IPU option to enable the ipu_runtime op"));
#endif
}
};
} // namespace operators
} // namespace paddle
......@@ -173,6 +173,13 @@ void set_constant_with_place<platform::NPUPinnedPlace>(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
......@@ -54,7 +54,7 @@ class PyLayerOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
auto data_type = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
return framework::OpKernelType(data_type, ctx.device_context());
}
......
......@@ -555,10 +555,10 @@ class Reshape2Op : public ReshapeOp {
const framework::ExecutionContext &ctx) const override {
auto multi_inputs = ctx.MultiInput<framework::Tensor>("ShapeTensor");
if (multi_inputs.size() > 0) {
return framework::KernelSignature("reshape.mulhost", {"X", "ShapeTensor"},
return framework::KernelSignature("reshape_mulhost", {"X", "ShapeTensor"},
{}, {"Out"});
} else if (ctx.HasInput("Shape")) {
return framework::KernelSignature("reshape.host", {"X", "Shape"}, {},
return framework::KernelSignature("reshape_host", {"X", "Shape"}, {},
{"Out"});
} else {
return framework::KernelSignature("reshape", {"X"}, {"shape"}, {"Out"});
......
......@@ -18,6 +18,7 @@ namespace paddle {
namespace platform {
std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};
void CUDAGraph::Reset() {
if (is_reset_) return;
......@@ -58,6 +59,13 @@ void CUDAGraph::BeginSegmentCapture() {
IsCapturing(), true,
errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
"Graph is capturing."));
if (IsThreadLocalCapturing()) {
PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), true,
platform::errors::PermissionDenied(
"When capturing a CUDA Graph in thread-local mode, "
"segmented capturing cannot begin on a thread other "
"than the one that started the capture."));
}
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture(
capturing_graph_->stream_, capturing_graph_->capture_mode_));
PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
......@@ -82,6 +90,11 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
capturing_graph_->place_ = place;
capturing_graph_->stream_ = stream;
capturing_graph_->capture_mode_ = mode;
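  // In thread-local capture mode, remember which thread started the capture
  // so segment captures issued from other threads can be rejected.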
if (mode == cudaStreamCaptureModeThreadLocal) {
capturing_thread_id_ = std::this_thread::get_id();
VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: "
<< capturing_thread_id_;
}
BeginSegmentCapture();
#endif
}
......@@ -115,6 +128,7 @@ void CUDAGraph::EndSegmentCapture() {
std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
EndSegmentCapture();
capturing_thread_id_ = paddle::none;
return std::move(capturing_graph_);
}
......
......@@ -18,6 +18,7 @@
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#include "cuda.h" // NOLINT
#include "cuda_runtime.h" // NOLINT
......@@ -26,6 +27,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/utils/optional.h"
namespace paddle {
namespace platform {
......@@ -99,6 +101,25 @@ class CUDAGraph {
// supported during capturing CUDA Graph.
static bool IsValidCapturing();
static bool IsThreadLocalCapturing() {
#if CUDA_VERSION >= 10010
return IsCapturing() &&
capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
#else
return false;
#endif
}
static bool IsThisThreadCapturing() {
if (UNLIKELY(IsCapturing())) {
return IsThreadLocalCapturing()
? capturing_thread_id_.get() == std::this_thread::get_id()
: true;
} else {
return false;
}
}
private:
static CUDAGraphID UniqueID() {
static std::atomic<CUDAGraphID> id;
......@@ -118,6 +139,7 @@ class CUDAGraph {
bool is_reset_{false};
std::mutex mtx_;
static paddle::optional<std::thread::id> capturing_thread_id_;
static std::unique_ptr<CUDAGraph> capturing_graph_;
};
......
......@@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
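// sm_70+ with CUDA >= 10.0 provides a native __half atomicAdd; the wrappers
// below only reinterpret between paddle float16 and __half.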
static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) {
return *reinterpret_cast<float16 *>(&x);
}
static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) {
return *reinterpret_cast<__half *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, float16) {
return CUDAFP16ToPDFP16(
atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, float16) {
  // A packed float16 value may live in either the lower or the higher 16 bits
  // of the aligned 32-bit word at this address.
......@@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
}
}
#endif
#endif
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
......
# IPU
IF(WITH_IPU)
FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc)
cc_library(ipu_device SRCS device.cc DEPS enforce popart)
cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *activation_op_handler(Graph *graph, Node *node, const std::string &type) {
auto new_node = CreateBaseOp(graph, node, type, {GetInputVarNode("X", node)},
node->outputs);
return new_node;
}
Node *relu_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_relu");
}
Node *tanh_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_tanh");
}
Node *log_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_log");
}
Node *sigmoid_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_sigmoid");
}
Node *sqrt_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_sqrt");
}
Node *gelu_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_gelu_v2");
}
Node *log_softmax_handler(Graph *graph, Node *node) {
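  // log_softmax(x) == log(softmax(x)): build an opset-11 softmax on `axis`
  // and feed its output into popart_log.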
auto axis = BOOST_GET_CONST(int, node->Op()->GetAttr("axis"));
auto new_softmax = CreateSoftmaxOpset11(graph, node, node->inputs, {}, axis);
return CreateBaseOp(graph, node, "popart_log", new_softmax->outputs,
node->outputs);
}
REGISTER_HANDLER(relu, relu_handler);
REGISTER_HANDLER(tanh, tanh_handler);
REGISTER_HANDLER(log, log_handler);
REGISTER_HANDLER(sigmoid, sigmoid_handler);
REGISTER_HANDLER(sqrt, sqrt_handler);
REGISTER_HANDLER(gelu, gelu_handler);
REGISTER_HANDLER(log_softmax, log_softmax_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
// This avoids the static initialisation order fiasco.
std::unordered_map<std::string, SymbolHandler> &SymbolHandlers() {
static std::unordered_map<std::string, SymbolHandler> symbol_handlers;
return symbol_handlers;
}
bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) {
if (SymbolHandlers().count(symbol) != 0) {
LOG(WARNING) << "Trying to register popart handler twice for operator: "
<< symbol;
return false;
}
bool new_handler = SymbolHandlers().emplace(symbol, handler).second;
return new_handler;
}
// Return the handler registered for this kind of node,
// or an empty std::function otherwise.
SymbolHandler GetHandler(const std::string &kind) {
auto it = SymbolHandlers().find(kind);
if (it != SymbolHandlers().end()) {
return it->second;
}
return {};
}
void ConnectNodes(Node *first_node, Node *next_node) {
first_node->outputs.push_back(next_node);
next_node->inputs.push_back(first_node);
}
void DisConnectNodes(Node *first_node, Node *next_node) {
auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) {
vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end());
};
rm_by_value(first_node->outputs, next_node);
rm_by_value(next_node->inputs, first_node);
rm_by_value(first_node->inputs, next_node);
rm_by_value(next_node->outputs, first_node);
}
void ClearNode(Node *node) {
auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) {
vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end());
};
for (auto *node_in : node->inputs) {
rm_by_value(node_in->outputs, node);
}
for (auto *node_out : node->outputs) {
rm_by_value(node_out->inputs, node);
}
}
void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op,
bool override) {
if (new_op->HasAttr(attr_name) && !override) {
return;
}
if (op->HasAttr(attr_name)) {
VLOG(10) << "Copying attr: " << attr_name << " from " << op->Type()
<< " to " << new_op->Type();
new_op->SetAttr(attr_name, op->GetAttr(attr_name));
new_op->Flush();
}
}
const int VarType2OnnxDtype(const int type) {
auto dtype = static_cast<framework::proto::VarType::Type>(type);
switch (dtype) {
case framework::proto::VarType::BOOL:
return static_cast<int>(ONNXDataType::BOOL);
case framework::proto::VarType::INT16:
return static_cast<int>(ONNXDataType::INT16);
case framework::proto::VarType::INT32:
return static_cast<int>(ONNXDataType::INT32);
case framework::proto::VarType::INT64:
return static_cast<int>(ONNXDataType::INT64);
case framework::proto::VarType::FP16:
return static_cast<int>(ONNXDataType::FLOAT16);
case framework::proto::VarType::FP32:
return static_cast<int>(ONNXDataType::FLOAT);
case framework::proto::VarType::FP64:
return static_cast<int>(ONNXDataType::DOUBLE);
case framework::proto::VarType::UINT8:
return static_cast<int>(ONNXDataType::UINT8);
case framework::proto::VarType::INT8:
return static_cast<int>(ONNXDataType::INT8);
case framework::proto::VarType::BF16:
return static_cast<int>(ONNXDataType::BFLOAT16);
case framework::proto::VarType::COMPLEX64:
return static_cast<int>(ONNXDataType::COMPLEX64);
case framework::proto::VarType::COMPLEX128:
return static_cast<int>(ONNXDataType::COMPLEX128);
default:
PADDLE_THROW(
platform::errors::Unimplemented("Unsupported data type: %d.", dtype));
}
}
const std::string VarType2PopStr(const int type) {
auto dtype = static_cast<framework::proto::VarType::Type>(type);
switch (dtype) {
case framework::proto::VarType::UINT8:
return "UINT8";
case framework::proto::VarType::INT8:
return "INT8";
case framework::proto::VarType::INT16:
return "INT16";
case framework::proto::VarType::INT32:
return "INT32";
case framework::proto::VarType::INT64:
return "INT64";
case framework::proto::VarType::BOOL:
return "BOOL";
case framework::proto::VarType::FP64:
return "DOUBLE";
case framework::proto::VarType::FP32:
return "FLOAT";
case framework::proto::VarType::FP16:
return "FLOAT16";
default:
PADDLE_THROW(
paddle::platform::errors::Unavailable("Unsupported data type."));
}
}
Node *GetInputVarNode(const std::string &input_name, const Node *op_node,
const int id) {
auto var_name = op_node->Op()->Input(input_name).at(id);
return GetInputVarNodeByVarName(var_name, op_node);
}
Node *GetOutputVarNode(const std::string &output_name, const Node *op_node,
const int id) {
auto var_name = op_node->Op()->Output(output_name).at(id);
return GetOutputVarNodeByVarName(var_name, op_node);
}
Node *GetInputVarNodeByVarName(const std::string &var_name,
const Node *op_node) {
for (auto *var : op_node->inputs) {
if (var->Name() == var_name) {
return var;
}
}
return nullptr;
}
Node *GetOutputVarNodeByVarName(const std::string &var_name,
const Node *op_node) {
for (auto *var : op_node->outputs) {
if (var->Name() == var_name) {
return var;
}
}
return nullptr;
}
const bool is_float_equal(float a, float b, float eps) {
return std::fabs(a - b) <= eps;
}
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
using framework::ir::Graph;
using framework::ir::Node;
using framework::OpDesc;
#define REGISTER_HANDLER(name, func) \
static bool __UNUSED_##name = \
paddle::platform::ipu::RegisterHandler(#name, func)
using SymbolHandler = std::function<Node *(Graph *, Node *)>;
std::unordered_map<std::string, SymbolHandler> &SymbolHandlers();
bool RegisterHandler(const std::string &, const SymbolHandler &);
SymbolHandler GetHandler(const std::string &);
void ConnectNodes(Node *first_node, Node *next_node);
void DisConnectNodes(Node *first_node, Node *next_node);
void ClearNode(Node *node);
void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op,
bool override = false);
const int VarType2OnnxDtype(const int type);
const std::string VarType2PopStr(const int type);
Node *GetInputVarNode(const std::string &input_name, const Node *op_node,
const int id = 0);
Node *GetOutputVarNode(const std::string &output_name, const Node *op_node,
const int id = 0);
Node *GetInputVarNodeByVarName(const std::string &var_name,
const Node *op_node);
Node *GetOutputVarNodeByVarName(const std::string &var_name,
const Node *op_node);
const bool is_float_equal(float a, float b, float eps = 1e-8);
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *elementwise_op_handler(Graph *graph, Node *node,
const std::string &type) {
auto *op = node->Op();
auto x_shape = GetInputVarNode("X", node)->Var()->GetShape();
int64_t x_rank = x_shape.size();
auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape();
int64_t y_rank = y_shape.size();
auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
if (axis == -1 || axis == x_rank - 1 || x_rank == y_rank) {
auto new_node =
CreateBaseOp(graph, node, type,
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
node->outputs);
return new_node;
} else {
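    // Paddle broadcasts Y starting at `axis`; reshape Y to rank(X) by padding
    // its shape with 1s outside [axis, axis + rank(Y)) so the popart op can
    // broadcast it directly.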
auto y_new_shape = std::vector<int64_t>(x_rank, 1);
for (int i = axis; i < axis + y_rank; ++i) {
y_new_shape[i] = y_shape[i - axis];
}
auto attrs = AttributeMap{
{"value", y_new_shape},
{"dims", std::vector<int64_t>{x_rank}},
{"dtype", ONNXDataType::INT64},
};
// constant
auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
// reshape
auto new_node_reshape = CreateBaseOp(
graph, node, "popart_reshape",
{GetInputVarNode("Y", node), new_node_const->outputs[0]}, {});
// elementwise_op
auto new_node =
CreateBaseOp(graph, node, type,
{GetInputVarNode("X", node), new_node_reshape->outputs[0]},
node->outputs);
return new_node;
}
}
Node *elementwise_add_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_add");
}
Node *elementwise_sub_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_sub");
}
Node *elementwise_div_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_div");
}
Node *elementwise_mul_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_mul");
}
Node *elementwise_min_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_min");
}
Node *elementwise_max_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_max");
}
Node *elementwise_pow_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_pow");
}
Node *elementwise_mod_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_mod");
}
REGISTER_HANDLER(elementwise_add, elementwise_add_handler);
REGISTER_HANDLER(elementwise_sub, elementwise_sub_handler);
REGISTER_HANDLER(elementwise_div, elementwise_div_handler);
REGISTER_HANDLER(elementwise_mul, elementwise_mul_handler);
REGISTER_HANDLER(elementwise_min, elementwise_min_handler);
REGISTER_HANDLER(elementwise_max, elementwise_max_handler);
REGISTER_HANDLER(elementwise_pow, elementwise_pow_handler);
REGISTER_HANDLER(elementwise_mod, elementwise_mod_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *equal_handler(Graph *graph, Node *node) {
auto new_node = CreateBaseOp(
graph, node, "popart_equal",
{GetInputVarNode("X", node), GetInputVarNode("Y", node)}, node->outputs);
return new_node;
}
REGISTER_HANDLER(equal, equal_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *mean_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_reducemean",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)},
{
{"keepdims", int64_t{0}},
});
}
Node *pow_handler(Graph *graph, Node *node) {
auto *op = node->Op();
if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) {
return CreateBaseOp(
graph, node, "popart_pow",
{GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)},
node->outputs);
} else {
    // Op(pow) -> Op(Constant) -> Var(const_out) -> Op(Pow)
auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor"));
auto attrs =
MakeConstAttrMapFromValue<float>(value_, {1}, ONNXDataType::FLOAT);
auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node),
new_node_const->outputs[0]},
node->outputs);
}
}
Node *mul_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto x_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("x_num_col_dims"));
auto y_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("y_num_col_dims"));
auto x_shape_ = GetInputVarNode("X", node)->Var()->GetShape();
auto y_shape_ = GetInputVarNode("Y", node)->Var()->GetShape();
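  // mul: flatten X at x_num_col_dims and Y at y_num_col_dims to 2-D, matmul
  // the two matrices, then reshape the result back to the expected shape.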
// build the shape for reshape
std::vector<int64_t> reshape_shape_{};
for (int left = 0; left < x_num_col_dims; left++) {
reshape_shape_.push_back(int64_t(x_shape_[left]));
}
for (int right = y_num_col_dims; right < y_shape_.size(); right++) {
reshape_shape_.push_back(int64_t(y_shape_[right]));
}
auto x_flatten =
CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("X", node)},
{}, {{"axis", int64_t(x_num_col_dims)}});
auto y_flatten =
CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("Y", node)},
{}, {{"axis", int64_t(y_num_col_dims)}});
auto matmul =
CreateBaseOp(graph, node, "popart_matmul",
{x_flatten->outputs[0], y_flatten->outputs[0]}, {}, {});
auto reshape_const = CreateConst(
graph, node, {}, {},
{{"value", reshape_shape_},
{"dims", std::vector<int64_t>{int64_t(reshape_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
return CreateBaseOp(graph, node, "popart_reshape",
{matmul->outputs[0], reshape_const->outputs[0]},
node->outputs, {});
}
Node *matmul_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X"));
auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y"));
auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha"));
auto x_shape = GetInputVarNode("X", node)->Var()->GetShape();
auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape();
int x_rank = x_shape.size();
std::vector<int64_t> perm;
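  // For rank > 2, transpose_X / transpose_Y are emulated with popart_transpose
  // using a perm that swaps the two innermost dims; rank 2 maps to a gemm.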
if (x_rank == 1) {
perm = std::vector<int64_t>{0};
} else if (x_rank == 2) {
return CreateGemm(graph, node,
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
node->outputs, transpose_x, transpose_y, alpha);
} else if (x_rank == 3) {
perm = std::vector<int64_t>{0, 2, 1};
} else if (x_rank == 4) {
perm = std::vector<int64_t>{0, 1, 3, 2};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op matmul with input rank == %d", x_rank));
}
Node *x_node = GetInputVarNode("X", node);
Node *y_node = GetInputVarNode("Y", node);
if (transpose_x) {
x_node = CreateBaseOp(graph, node, "popart_transpose",
{GetInputVarNode("X", node)}, {}, {{"perm", perm}});
x_node = x_node->outputs[0];
}
if (transpose_y) {
y_node = CreateBaseOp(graph, node, "popart_transpose",
{GetInputVarNode("Y", node)}, {}, {{"perm", perm}});
y_node = y_node->outputs[0];
}
  if (is_float_equal(alpha, 1.0)) {
    return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node},
                        node->outputs);
  } else {
    // scale the matmul result by the constant alpha
    auto o_node =
        CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {});
    auto attr = MakeConstAttrMapFromValue(alpha, {1}, ONNXDataType::FLOAT);
    auto const_node = CreateConst(graph, node, {}, {}, attr);
    return CreateBaseOp(graph, node, "popart_mul",
                        {o_node->outputs[0], const_node->outputs[0]},
                        node->outputs);
  }
}
Node *sum_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_sum", node->inputs, node->outputs);
}
Node *softmax_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis);
}
Node *scale_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
auto bias_after_scale_ =
BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType();
auto new_node_bias_var =
CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{bias_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
new_node_bias_var = new_node_bias_var->outputs[0];
Node *new_node_scale_var = nullptr;
if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) {
new_node_scale_var = GetInputVarNode("ScaleTensor", node);
} else {
new_node_scale_var =
CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{scale_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
new_node_scale_var = new_node_scale_var->outputs[0];
}
// convert to float32
auto new_node_cast =
CreateCast(graph, node, {GetInputVarNode("X", node)}, {},
static_cast<int>(framework::proto::VarType::FP32));
Node *result = nullptr;
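  // bias_after_scale: Out = scale * X + bias; otherwise Out = scale * (X + bias).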
if (bias_after_scale_) {
auto new_node_mul =
CreateBaseOp(graph, node, "popart_mul",
{new_node_cast->outputs[0], new_node_scale_var}, {}, {});
result =
CreateBaseOp(graph, node, "popart_add",
{new_node_mul->outputs[0], new_node_bias_var}, {}, {});
} else {
auto new_node_add =
CreateBaseOp(graph, node, "popart_add",
{new_node_cast->outputs[0], new_node_bias_var}, {}, {});
result =
CreateBaseOp(graph, node, "popart_mul",
{new_node_add->outputs[0], new_node_scale_var}, {}, {});
}
auto result_after_cast =
CreateCast(graph, node, result->outputs, node->outputs,
static_cast<int>(data_type_));
return result_after_cast;
}
Node *cross_entropy2_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index"));
auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, {},
framework::proto::VarType::INT32);
auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape();
if (label_shape_.size() == 1) {
return CreateBaseOp(graph, node, "popart_nllloss",
{GetInputVarNode("X", node), new_cast->outputs[0]},
{GetOutputVarNode("Y", node)},
{
{"ignoreIndex", ignoreIndex},
});
} else {
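    // popart_nllloss expects a 1-D label: flatten the (N, 1) label, compute
    // the loss, then reshape the loss back to the original label shape.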
std::vector<int64_t> new_shape_{label_shape_[0]};
auto const_before_loss = CreateBaseOp(
graph, node, "popart_constant", {}, {},
{{"value", new_shape_},
{"dims",
std::vector<int64_t>{static_cast<int64_t>(new_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
auto reshape_before_loss = CreateBaseOp(
graph, node, "popart_reshape",
{new_cast->outputs[0], const_before_loss->outputs[0]}, {}, {});
auto nllloss = CreateBaseOp(
graph, node, "popart_nllloss",
{GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {},
{
{"ignoreIndex", ignoreIndex},
});
auto const_after_loss = CreateBaseOp(
graph, node, "popart_constant", {}, {},
{{"value", label_shape_},
{"dims",
std::vector<int64_t>{static_cast<int64_t>(label_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
auto reshape_after_loss =
CreateBaseOp(graph, node, "popart_reshape",
{nllloss->outputs[0], const_after_loss->outputs[0]},
{GetOutputVarNode("Y", node)}, {});
return reshape_after_loss;
}
}
REGISTER_HANDLER(mean, mean_handler);
REGISTER_HANDLER(pow, pow_handler);
REGISTER_HANDLER(mul, mul_handler);
REGISTER_HANDLER(matmul, matmul_handler);
REGISTER_HANDLER(sum, sum_handler);
REGISTER_HANDLER(softmax, softmax_handler);
REGISTER_HANDLER(scale, scale_handler);
REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *conv2d_handler(Graph *graph, Node *node) {
OpDesc *op = node->Op();
auto dilations_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("dilations"));
auto dilations = std::vector<int64_t>{dilations_.begin(), dilations_.end()};
auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
auto pads_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
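  // Paddle may give only 2 padding values (h, w); repeat the pair so 4 values
  // are passed on to CreateConv.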
if (pads_.size() == 2) {
pads_.push_back(pads_[0]);
pads_.push_back(pads_[1]);
}
auto pads = std::vector<int64_t>{pads_.begin(), pads_.end()};
auto stride_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
auto stride = std::vector<int64_t>{stride_.begin(), stride_.end()};
if (op->HasInput("Bias") && !op->Input("Bias").empty()) {
return CreateConv(
graph, node,
{
GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
GetInputVarNode("Bias", node),
},
node->outputs, dilations, group_, {}, pads, stride);
} else {
return CreateConv(
graph, node,
{
GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
},
node->outputs, dilations, group_, {}, pads, stride);
}
}
Node *batch_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
std::vector<Node *> inputs;
inputs.push_back(GetInputVarNode("X", node));
inputs.push_back(GetInputVarNode("Scale", node));
inputs.push_back(GetInputVarNode("Bias", node));
inputs.push_back(GetInputVarNode("Mean", node));
inputs.push_back(GetInputVarNode("Variance", node));
int64_t num_outputs = 1;
std::vector<Node *> outputs;
auto is_test_type = op->GetAttrType("is_test");
bool is_test;
if (is_test_type == 0) {
// int
is_test = BOOST_GET_CONST(int, op->GetAttr("is_test"));
} else {
// bool
is_test = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
}
outputs.push_back(GetOutputVarNode("Y", node));
if (!is_test) {
outputs.push_back(GetOutputVarNode("MeanOut", node));
outputs.push_back(GetOutputVarNode("VarianceOut", node));
outputs.push_back(GetOutputVarNode("SavedMean", node));
outputs.push_back(GetOutputVarNode("SavedVariance", node));
num_outputs = 5;
}
// outputs.push_back(GetOutputVarNode("ReserveSpace", node));
auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum"));
auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
// data_layout
return CreateBaseOp(graph, node, "popart_batchnormalization", inputs, outputs,
{
{"momentum", momentum},
{"epsilon", epsilon},
{"num_outputs", num_outputs},
});
}
Node *pool2d_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type"));
auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling"));
if (global_pooling) {
if (pooling_type == "max") {
return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
node->outputs);
} else if (pooling_type == "avg") {
return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs,
node->outputs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn pooling_type: %s", pooling_type));
}
}
if (op->HasAttr("padding_algorithm")) {
auto padding_algorithm =
BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm"));
if (padding_algorithm != "EXPLICIT") {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn padding_algorithm: %s", padding_algorithm));
}
}
auto ksize = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
auto kernel_shape = std::vector<int64_t>{ksize.begin(), ksize.end()};
auto ceil_mode_ = BOOST_GET_CONST(bool, op->GetAttr("ceil_mode"));
auto ceil_mode = int64_t(ceil_mode_ ? 1 : 0);
auto paddings = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
auto pads = std::vector<int64_t>{paddings.begin(), paddings.end()};
if (pads.size() == 2) {
pads.push_back(paddings[0]);
pads.push_back(paddings[1]);
}
auto strides_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
auto strides = std::vector<int64_t>{strides_.begin(), strides_.end()};
if (pooling_type == "max") {
int64_t num_outputs = 1;
auto dilations = std::vector<int64_t>{};
int64_t storage_order = 0;
return CreateBaseOp(graph, node, "popart_maxpool", node->inputs,
node->outputs, {
{"num_outputs", num_outputs},
{"kernel_shape", kernel_shape},
{"ceil_mode", ceil_mode},
{"dilations", dilations},
{"pads", pads},
{"storage_order", storage_order},
{"strides", strides},
});
} else if (pooling_type == "avg") {
int64_t count_include_pad = 0;
return CreateBaseOp(graph, node, "popart_averagepool", node->inputs,
node->outputs,
{
{"kernel_shape", kernel_shape},
{"ceil_mode", ceil_mode},
{"count_include_pad", count_include_pad},
{"pads", pads},
{"strides", strides},
});
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn pooling_type: %s", pooling_type));
}
}
Node *group_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
auto groups_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
auto groups = int64_t{groups_};
auto attrs_ = AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups}};
std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)};
std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node),
GetOutputVarNode("Mean", node),
GetOutputVarNode("Variance", node)};
return CreateBaseOp(graph, node, "popart_groupnormalization_v2", inputs_,
outputs_, attrs_);
}
Node *instance_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
auto attrs_ = AttributeMap{{"epsilon", epsilon_}};
std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)};
std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node)};
return CreateBaseOp(graph, node, "popart_instancenormalization", inputs_,
outputs_, attrs_);
}
Node *layer_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto begin_norm_axis_ = BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis"));
auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape();
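  // Lower layer_norm to group normalization with a single group: collapse the
  // input to [outer, inner] around begin_norm_axis, normalize, reshape back.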
std::vector<int64_t> norm_shape_{1, 1};
for (int i = 0; i < input_shape_.size(); i++) {
if (i < begin_norm_axis_) {
norm_shape_[0] *= input_shape_[i];
} else {
norm_shape_[1] *= input_shape_[i];
}
}
auto attrs1 = AttributeMap{
{"value", norm_shape_},
{"dims", std::vector<int64_t>{static_cast<int64_t>(norm_shape_.size())}},
{"dtype", ONNXDataType::INT64}};
auto reshape1_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs1);
auto new_node_reshape1 = CreateBaseOp(
graph, node, "popart_reshape",
{GetInputVarNode("X", node), reshape1_const->outputs[0]}, {}, {});
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
int64_t groups_ = 1;
auto groupnorm_attrs_ =
AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}};
auto out_Y_ = MakeVarNode(graph, node);
CreateBaseOp(graph, node, "popart_groupnormalization_v2",
{new_node_reshape1->outputs[0], GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)},
{out_Y_, GetOutputVarNode("Mean", node),
GetOutputVarNode("Variance", node)},
groupnorm_attrs_);
auto attrs2 = AttributeMap{
{"value", input_shape_},
{"dims", std::vector<int64_t>{static_cast<int64_t>(input_shape_.size())}},
{"dtype", ONNXDataType::INT64}};
auto reshape2_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs2);
auto new_node_reshape2 = CreateBaseOp(graph, node, "popart_reshape",
{out_Y_, reshape2_const->outputs[0]},
{GetOutputVarNode("Y", node)}, {});
return new_node_reshape2;
}
Node *dropout_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto dropout_prob_ = BOOST_GET_CONST(float, op->GetAttr("dropout_prob"));
auto dropout_implementation_ =
BOOST_GET_CONST(std::string, op->GetAttr("dropout_implementation"));
auto is_test_type_ = op->GetAttrType("is_test");
bool is_test_;
if (is_test_type_ == 0) {
// int
is_test_ = BOOST_GET_CONST(int, op->GetAttr("is_test"));
} else {
// bool
is_test_ = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
}
if (is_test_) {
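    // At inference time, dropout is an identity for upscale_in_train and a
    // (1 - dropout_prob) scaling for downgrade_in_infer.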
if (dropout_implementation_ == "upscale_in_train") {
return CreateBaseOp(graph, node, "popart_identity",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)}, {});
} else if (dropout_implementation_ == "downgrade_in_infer") {
auto scale =
CreateConst(graph, node, {}, {},
{{"value", std::vector<float>{1 - dropout_prob_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
return CreateBaseOp(graph, node, "popart_mul",
{GetInputVarNode("X", node), scale->outputs[0]},
{GetOutputVarNode("Out", node)}, {});
} else {
PADDLE_THROW(
platform::errors::InvalidArgument("Invalid dropout_implementation"));
}
} else {
if (dropout_implementation_ == "upscale_in_train") {
auto attrs_ =
AttributeMap{{"num_outputs", (int64_t)1}, {"ratio", dropout_prob_}};
return CreateBaseOp(graph, node, "popart_dropout",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)}, attrs_);
} else if (dropout_implementation_ == "downgrade_in_infer") {
PADDLE_THROW(platform::errors::InvalidArgument(
"Do not support downgrade_in_infer with training"));
} else {
PADDLE_THROW(
platform::errors::InvalidArgument("Invalid dropout_implementation"));
}
}
}
REGISTER_HANDLER(pool2d, pool2d_handler);
REGISTER_HANDLER(batch_norm, batch_norm_handler);
REGISTER_HANDLER(group_norm, group_norm_handler);
REGISTER_HANDLER(instance_norm, instance_norm_handler);
REGISTER_HANDLER(layer_norm, layer_norm_handler);
REGISTER_HANDLER(conv2d, conv2d_handler);
REGISTER_HANDLER(dropout, dropout_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
namespace paddle {
namespace platform {
namespace ipu {
// file-local counters used to generate unique variable/op names
static int var_count = 0;
static int op_count = 0;
const std::string GenerateVarName() {
return std::string("_gen_var_") + std::to_string(var_count++);
}
const std::string GenerateOpName() {
return std::string("_gen_op_") + std::to_string(op_count++);
}
const std::string CreateOpIdentifyId(Node *node) {
  // format: op_type|out_var0|out_var1|...|_gen_*
  // This name is used as the op name when exporting the ONNX model from PopART.
auto op_type = node->Name();
std::string op_out = "";
for (auto *out_node : node->outputs) {
op_out += "|";
op_out += out_node->Name();
}
return {op_type + op_out + "|" + GenerateOpName()};
}
Node *MakeVarNode(Graph *graph, Node *node) {
auto var_name = GenerateVarName();
auto var_desc = std::make_unique<framework::VarDesc>(var_name);
auto var = graph->CreateVarNode(var_desc.get());
return var;
}
Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs) {
auto op_desc = std::make_unique<framework::OpDesc>();
op_desc->SetType(type);
auto op = graph->CreateOpNode(op_desc.get());
for (auto *in : inputs) {
ConnectNodes(in, op);
}
if (outputs.empty()) {
auto var = MakeVarNode(graph, node);
ConnectNodes(op, var);
} else {
for (auto *out : outputs) {
ConnectNodes(op, out);
}
}
  // record the connected input/output var names on the op desc
std::vector<std::string> input_names;
for (auto node : op->inputs) {
input_names.push_back(node->Name());
}
op->Op()->SetInput("__inputs__", input_names);
std::vector<std::string> output_names;
for (auto node : op->outputs) {
output_names.push_back(node->Name());
}
op->Op()->SetOutput("__outputs__", output_names);
op->Op()->Flush();
return op;
}
Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs) {
auto new_node = MakeOpNode(graph, node, type, inputs, outputs);
if (!attrs.empty()) {
new_node->Op()->SetAttrMap(attrs);
}
// deal special attr
if (!new_node->Op()->HasAttr(sIpuIndexAttr)) {
CopyOpAttr(sIpuIndexAttr, node->Op(), new_node->Op());
}
if (!new_node->Op()->HasAttr(sIpuStageAttr)) {
CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op());
}
{
new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node));
new_node->Op()->Flush();
}
return new_node;
}
Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs) {
return CreateBaseOp(graph, node, "popart_constant", inputs, outputs, attrs);
}
Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, const int otype) {
auto to = VarType2PopStr(otype);
return CreateBaseOp(graph, node, "popart_cast", inputs, outputs,
{{"to", to}});
}
Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t transA,
int64_t transB, float alpha, float beta) {
return CreateBaseOp(graph, node, "popart_gemm", inputs, outputs,
{
{"alpha", alpha},
{"beta", beta},
{"transA", transA},
{"transB", transB},
});
}
Node *CreateReshape(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &oshape) {
auto attr = AttributeMap{
{"value", oshape},
{"dims", std::vector<int64_t>{static_cast<int64_t>(oshape.size())}},
{"dtype", ONNXDataType::INT64}};
auto new_node_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attr);
auto new_node_reshape =
CreateBaseOp(graph, node, "popart_reshape",
{inputs[0], new_node_const->outputs[0]}, outputs);
return new_node_reshape;
}
Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &dilations, int64_t group,
const std::vector<int64_t> &kernel_shape,
const std::vector<int64_t> &pads,
const std::vector<int64_t> &strides) {
auto attrs = AttributeMap{
{"dilations", dilations}, {"group", group},
{"kernel_shape", kernel_shape}, {"pads", pads},
{"strides", strides},
};
return CreateBaseOp(graph, node, "popart_conv", inputs, outputs, attrs);
}
Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t axis) {
PADDLE_ENFORCE_EQ(inputs.size(), 1, platform::errors::InvalidArgument(
"Softmax op only support one input"));
auto x_shape = inputs[0]->Var()->GetShape();
int x_rank = x_shape.size();
if (axis < 0) {
axis = axis + x_rank;
}
if (axis == x_rank - 1) {
return CreateBaseOp(graph, node, "popart_softmax", inputs, outputs,
{{"axis", int64_t{-1}}});
} else {
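    // Softmax on a non-innermost axis: swap `axis` with the last dim, apply
    // softmax on -1, then apply the same perm again to undo the swap.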
auto perm = std::vector<int64_t>(x_rank);
std::iota(perm.begin(), perm.end(), 0);
perm[x_rank - 1] = axis;
perm[axis] = x_rank - 1;
auto new_transpose_pre = CreateBaseOp(graph, node, "popart_transpose",
inputs, {}, {{"perm", perm}});
auto new_softmax =
CreateBaseOp(graph, node, "popart_softmax", new_transpose_pre->outputs,
{}, {{"axis", int64_t{-1}}});
return CreateBaseOp(graph, node, "popart_transpose", new_softmax->outputs,
outputs, {{"perm", perm}});
}
}
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/ipu/common.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
using paddle::framework::AttributeMap;
template <typename T>
AttributeMap MakeConstAttrMap(std::vector<T> value, std::vector<int64_t> dims,
int dtype) {
return AttributeMap{{"value", value}, {"dims", dims}, {"dtype", dtype}};
}
template <typename T>
AttributeMap MakeConstAttrMapFromValue(T v, std::vector<int64_t> dims,
int dtype) {
size_t size = 1;
for (auto &dim : dims) {
size *= dim;
}
return MakeConstAttrMap<T>(std::vector<T>(size, v), dims, dtype);
}
const std::string GenerateVarName();
const std::string CreateOpIdentifyId(Node *node);
Node *MakeVarNode(Graph *graph, Node *node);
Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs);
Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs = {});
Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs);
// otype is proto::VarType::Type
Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, const int otype);
Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t transA = 0,
int64_t transB = 0, float alpha = 1.0f, float beta = 1.0f);
Node *CreateReshape(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &oshape);
Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &dilations = {1, 1},
int64_t group = 1,
const std::vector<int64_t> &kernel_shape = {},
const std::vector<int64_t> &pads = {0, 0, 0, 0},
const std::vector<int64_t> &strides = {1, 1});
Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t axis);
} // namespace ipu
} // namespace platform
} // namespace paddle
......@@ -195,3 +195,5 @@ OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT
OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT
OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT
OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT
// clang-format on
......@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that "
"your train process set the correct device id if you use Executor.",
"with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
......@@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (platform::is_ipu_place(p)) {
#ifdef PADDLE_WITH_IPU
EmplaceDeviceContext<IPUDeviceContext, IPUPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {
int id = place.GetDeviceId();
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
device_ = ipu_backend->GetDevice(id);
}
Place IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const {
  /*! \brief Wait for the completion of all operations in the stream. */
}
IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() {
context_ = xpu::create_context();
......
......@@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> {
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) {
return boost::apply_visitor(IsNPUPlace(), p);
}
bool is_ipu_place(const Place &p) {
return boost::apply_visitor(IsIPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else if (is_npu_place(p1)) {
return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
} else if (is_ipu_place(p1)) {
return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -95,12 +95,25 @@ struct NPUPinnedPlace {
inline bool operator!=(const NPUPinnedPlace &) const { return false; }
inline bool operator<(const NPUPinnedPlace &) const { return false; }
};
struct IPUPlace {
IPUPlace() : IPUPlace(0) {}
explicit IPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const IPUPlace &o) const { return device == o.device; }
inline bool operator!=(const IPUPlace &o) const { return !(*this == o); }
inline bool operator<(const IPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
......@@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return true; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return true; }
};
struct IsIPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
};
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace, NPUPinnedPlace> {
CUDAPinnedPlace, NPUPinnedPlace, IPUPlace> {
private:
using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace, NPUPinnedPlace>;
CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT
Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -180,6 +208,7 @@ using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_ipu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
......@@ -228,6 +257,15 @@ struct PlaceVisitorWrapper
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const IPUPlace &ipu) const {
#ifdef PADDLE_WITH_IPU
return visitor_(ipu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with IPU. Cannot visit ipu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
pybind.h
op_function_impl.h
\ No newline at end of file
op_function_impl.h
eager_op_function_impl.h
......@@ -130,6 +130,10 @@ limitations under the License. */
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#include "paddle/fluid/platform/ipu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
......@@ -199,6 +203,14 @@ bool IsCompiledWithNPU() {
#endif
}
bool IsCompiledWithIPU() {
#ifndef PADDLE_WITH_IPU
return false;
#else
return true;
#endif
}
bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN
return false;
......@@ -812,6 +824,8 @@ PYBIND11_MODULE(core_noavx, m) {
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
R"DOC(
......@@ -819,7 +833,7 @@ PYBIND11_MODULE(core_noavx, m) {
Args:
lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
LoDTensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
......@@ -1909,6 +1923,58 @@ All parameter, weight, gradient are variables in Paddle.
[](const platform::NPUPlace &self) { return self.GetDeviceId(); })
.def("__str__", string::to_string<const platform::NPUPlace &>);
// IPUPlace
py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
IPUPlace is a descriptor of a device.
It represents an IPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: ipu
ipu_place = paddle.IPUPlace()
)DOC")
.def("__init__",
[](platform::IPUPlace &self) {
#ifdef PADDLE_WITH_IPU
if (platform::GetIPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use IPU because there is no IPU "
"detected on your "
"machine.";
std::exit(-1);
}
// use ipu(0) to compile, while running with the number of IPUs the user
// configures in sharding and pipelining.
new (&self) platform::IPUPlace(0);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use IPU because you didn't install IPU version "
"PaddlePaddle.\n"
"If you want to use IPU, please try to install IPU version "
"PaddlePaddle by: pip install paddlepaddle*\n"
"If you only have CPU, please change IPUPlace to be "
"CPUPlace().\n");
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
.def("_equals",
&IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
.def("get_device_id",
[](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::IPUPlace &>);
py::class_<platform::Place> platformplace(m, "Place");
g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
platformplace.def(py::init<>())
......@@ -1918,6 +1984,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
......@@ -1927,6 +1994,8 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_npu_place",
[](platform::Place &self) { return platform::is_npu_place(self); })
.def("is_ipu_place",
[](platform::Place &self) { return platform::is_ipu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
......@@ -1943,6 +2012,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) {
return BOOST_GET_CONST(platform::NPUPlace, self).device;
})
.def("ipu_device_id",
[](platform::Place &self) {
return BOOST_GET_CONST(platform::IPUPlace, self).device;
})
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
......@@ -1966,6 +2039,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("set_place",
[](platform::Place &self, const platform::IPUPlace &ipu_place) {
self = ipu_place;
})
.def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>);
......@@ -2197,6 +2274,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_ascend", IsCompiledWithAscend);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_npu", IsCompiledWithNPU);
m.def("is_compiled_with_ipu", IsCompiledWithIPU);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_cinn", IsCompiledWithCINN);
......@@ -2516,6 +2594,10 @@ All parameter, weight, gradient are variables in Paddle.
});
#endif
#ifdef PADDLE_WITH_IPU
m.def("get_ipu_device_count", platform::GetIPUDeviceCount);
#endif
py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
.value("kDefault", platform::TracerOption::kDefault)
.value("kOpDetail", platform::TracerOption::kOpDetail)
......@@ -2593,6 +2675,11 @@ All parameter, weight, gradient are variables in Paddle.
bool val) { self.Set<bool>(name, new bool(val)); })
.def("set", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); })
.def("set",
[](ir::Pass &self, const std::string &name,
std::vector<std::string> set) {
self.Set(name, new std::vector<std::string>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name,
std::unordered_set<std::string> set) {
......@@ -3425,6 +3512,118 @@ All parameter, weight, gradient are variables in Paddle.
})
.def("device_count", &ParallelExecutor::DeviceCount);
#ifdef PADDLE_WITH_IPU
py::class_<platform::ipu::IpuBackend,
std::shared_ptr<platform::ipu::IpuBackend>>(m, "IpuBackend")
.def(py::init(&platform::ipu::IpuBackend::GetNewInstance))
.def("clear", &platform::ipu::IpuBackend::Clear)
.def("set_scope", &platform::ipu::IpuBackend::SetScope)
.def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy);
py::class_<platform::ipu::IpuStrategy>(m, "IpuStrategy")
.def(py::init())
.def_property(
"num_ipus",
[](const platform::ipu::IpuStrategy &self) { return self.num_ipus; },
[](platform::ipu::IpuStrategy &self, int num_ipus) {
self.num_ipus = num_ipus;
},
R"DOC(
Int type, set the number of IPUs we need. Default 1.
)DOC")
.def_property(
"accumulationFactor",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.accumulationFactor;
},
[](platform::ipu::IpuStrategy &self, int accumulationFactor) {
self.popart_options_.accumulationFactor = accumulationFactor;
},
R"DOC(
Specify the number of micro-batches to accumulate before
applying the varUpdate. Default 1.
)DOC")
.def_property("batches_per_step",
[](const platform::ipu::IpuStrategy &self) {
return self.batches_per_step;
},
[](platform::ipu::IpuStrategy &self, int batches_per_step) {
self.batches_per_step = batches_per_step;
},
R"DOC(
Int type, set batches_per_step. Default 1.
)DOC")
.def_property("is_training",
[](const platform::ipu::IpuStrategy &self) {
return self.is_training;
},
[](platform::ipu::IpuStrategy &self, bool is_training) {
self.is_training = is_training;
},
R"DOC(
Bool type, True for training, False for inference. Default True.
)DOC")
.def_property(
"enable_pipelining",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.enablePipelining;
},
[](platform::ipu::IpuStrategy &self, bool enable_pipelining) {
self.popart_options_.enablePipelining = enable_pipelining;
},
R"DOC(
Bool type, True to enable pipelining, otherwise disable. Default False.
)DOC")
.def_property(
"enable_manual_shard",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.virtualGraphMode ==
platform::ipu::VirtualGraphMode::Manual;
},
[](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) {
if (enable_ipu_shard) {
self.popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Manual;
} else {
self.popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Off;
}
},
R"DOC(
Bool type, True to enable model sharding, otherwise disable. Default False.
)DOC")
.def_property("need_avg_shard",
[](const platform::ipu::IpuStrategy &self) {
return self.need_avg_shard;
},
[](platform::ipu::IpuStrategy &self, bool need_avg_shard) {
self.need_avg_shard = need_avg_shard;
},
R"DOC(
Bool type, True to enable avg shard, otherwise disable. Default False.
)DOC")
.def_property("batch_size",
[](const platform::ipu::IpuStrategy &self) {
return self.batch_size;
},
[](platform::ipu::IpuStrategy &self, int batch_size) {
self.batch_size = batch_size;
},
R"DOC(
Int type, used to make batch size fixed. Default 1.
)DOC")
.def_property("enable_fp16",
[](const platform::ipu::IpuStrategy &self) {
return self.enable_fp16;
},
[](platform::ipu::IpuStrategy &self, bool enable_fp16) {
self.enable_fp16 = enable_fp16;
},
R"DOC(
Bool type, True to enable float16 mode, otherwise disable. Default False.)DOC");
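  // NOTE: a minimal Python-side usage sketch for the bindings above. The
  // module path (paddle.fluid.core) is an assumption for illustration, not an
  // official API example:
  //   ipu_strategy = core.IpuStrategy()
  //   ipu_strategy.num_ipus = 2
  //   ipu_strategy.is_training = True
  //   ipu_backend = core.IpuBackend()
  //   ipu_backend.set_ipu_strategy(ipu_strategy)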
#endif
BindFleetWrapper(&m);
BindIO(&m);
......
......@@ -37,6 +37,9 @@ PADDLE_DEFINE_EXPORTED_bool(
"If set true, the queue.pop will only get data from queue but not "
"remove the data from queue for speed testing");
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
namespace paddle {
namespace pybind {
......
......@@ -313,6 +313,21 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (paddle::platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
if (zero_copy) {
auto holder = std::make_shared<details::NumpyAllocation<T>>(array);
auto type = framework::ToDataType(std::type_index(typeid(T)));
self->ResetHolderWithType(holder, type);
} else {
auto dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), array.nbytes());
}
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -22,6 +22,10 @@ set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc)
set(api_header_file_tmp ${api_header_file}.tmp)
set(api_source_file_tmp ${api_source_file}.tmp)
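# Ensure PYTHON_EXECUTABLE is defined for the code-generation command below.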
if (NOT PYTHON_EXECUTABLE)
find_package(PythonInterp REQUIRED)
endif()
add_custom_command(
OUTPUT ${api_header_file} ${api_source_file}
COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/core/kernel_registry.h"
// TODO(chenweihang): After the kernels are split into individual files,
// the kernel declaration statements will be generated automatically from the
// kernel file names, and this header file will be removed.
PT_DECLARE_KERNEL(full_like, CPU);
PT_DECLARE_KERNEL(dot, CPU);
PT_DECLARE_KERNEL(flatten, CPU);
PT_DECLARE_KERNEL(sign, CPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_KERNEL(full_like, CUDA);
PT_DECLARE_KERNEL(dot, CUDA);
PT_DECLARE_KERNEL(flatten, CUDA);
PT_DECLARE_KERNEL(sign, CUDA);
#endif
#ifdef PADDLE_WITH_XPU
PT_DECLARE_KERNEL(flatten, XPU);
#endif
......@@ -25,10 +25,14 @@ limitations under the License. */
#include "paddle/pten/include/core.h"
#include "paddle/pten/include/infermeta.h"
PT_DECLARE_MODULE(UtilsCPU);
PT_DECLARE_KERNEL(copy, CPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(UtilsCUDA);
PT_DECLARE_KERNEL(copy, CUDA);
#endif
#ifdef PADDLE_WITH_XPU
PT_DECLARE_KERNEL(copy, XPU);
#endif
namespace paddle {
......
......@@ -27,13 +27,13 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
{"fill_any_like", "full_like"},
{"fill_constant", "full"},
{"flatten_contiguous_range", "flatten"},
// {"matmul_v2", "matmul"},
{"matmul_v2", "matmul"},
{"reduce_mean", "mean"},
{"reduce_sum", "sum"},
{"reshape2", "reshape"},
// fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated
{"flatten", "deprecated"},
// {"matmul", "deprecated"},
{"matmul", "deprecated"},
{"mean", "deprecated"},
{"reshape", "deprecated"},
{"sum", "deprecated"}};
......
......@@ -265,12 +265,8 @@ class KernelFactory {
KernelMap& kernels() { return kernels_; }
void InsertCompatibleOpType(const std::string& op_type) {
compatible_op_types_.insert(op_type);
}
bool HasCompatiblePtenKernel(const std::string& op_type) const {
return compatible_op_types_.count(TransToPtenKernelName(op_type)) > 0;
return kernels_.find(TransToPtenKernelName(op_type)) != kernels_.end();
}
const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name,
......@@ -288,9 +284,6 @@ class KernelFactory {
KernelFactory() = default;
KernelMap kernels_;
// Used to be compatible with the original execution system and
// quickly confirm whether the new kernel can be called
std::unordered_set<std::string> compatible_op_types_;
};
/** operator << overload **/
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cstring>
#include <string>
#include <type_traits>
#include <typeindex>
#include <typeinfo>
......@@ -24,6 +25,8 @@
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/pten/core/kernel_utils.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
#define BACKEND(arg__) pten::Backend::arg__
......@@ -140,7 +143,6 @@ struct KernelRegistrar {
Kernel kernel(kernel_fn);
args_parse_fn(kernel_key, kernel.mutable_args_def());
args_def_fn(&kernel);
KernelFactory::Instance().InsertCompatibleOpType(kernel_name.name());
KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
}
};
......@@ -193,64 +195,35 @@ struct KernelRegistrar {
#define _PT_ARG_N(args) _PT_ARG_N_EXPAND args
#define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/** PT_REGISTER_KERNEL
*
* The most frequently used kernel registration macro, used for kernel
* registration with only the data type as the template parameter; the
* function pointer for each listed data type is automatically instantiated
* during registration.
*/
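// For example, the CPU math kernels later in this change register with:
//   PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {}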
#define PT_REGISTER_KERNEL( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
_PT_REGISTER_KERNEL(kernel_name, \
PT_ID, \
backend, \
layout, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
pt_register_kernel_ns_check_##kernel_name, \
"PT_REGISTER_KERNEL must be called in global namespace."); \
_PT_REGISTER_KERNEL( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__)
#ifndef _WIN32
#define _PT_REGISTER_KERNEL( \
kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
"PT_REGISTER_KERNEL must be called in global namespace."); \
PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \
static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT(kernel_name, \
func_id, \
backend, \
layout, \
&PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel * kernel)
#define _PT_REGISTER_KERNEL( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \
static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT(kernel_name, \
backend, \
layout, \
&__PT_KERNEL_args_def_FN_##kernel_name, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel)
#else
#define _PT_REGISTER_KERNEL( \
kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
"PT_REGISTER_KERNEL must be called in global namespace."); \
static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT(kernel_name, \
func_id, \
backend, \
layout, \
&PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel * kernel)
#endif
#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
_PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \
PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \
(meta_kernel_fn, cpp_dtype, __VA_ARGS__)
/**
* `template decltype(fn) fn` can work on gcc and clang,
* but msvc will failed, error like:
......@@ -261,8 +234,30 @@ struct KernelRegistrar {
*
* https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua
*
* So we solve the explicit instantiation of the kernels via CMake,
* and msvc can work without template instantiation.
*/
#define _PT_REGISTER_KERNEL( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT(kernel_name, \
backend, \
layout, \
&__PT_KERNEL_args_def_FN_##kernel_name, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel)
#endif
#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
_PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \
PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \
(meta_kernel_fn, cpp_dtype, __VA_ARGS__)
#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \
template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>
......@@ -309,22 +304,15 @@ struct KernelRegistrar {
template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__))
#define PT_KERNEL_REGISTRAR_INIT(kernel_name, \
func_id, \
backend, \
layout, \
args_def_fn, \
meta_kernel_fn, \
cpp_dtype, \
...) \
_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \
kernel_name, \
func_id, \
backend, \
layout, \
args_def_fn, \
meta_kernel_fn, \
cpp_dtype, \
#define PT_KERNEL_REGISTRAR_INIT( \
kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \
_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \
kernel_name, \
backend, \
layout, \
args_def_fn, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
// clang-format off
......@@ -333,7 +321,6 @@ struct KernelRegistrar {
and multi-line macros cannot be skipped with NOLINT.*/
#define _PT_KERNEL_REGISTRAR_INIT(N, \
kernel_name, \
func_id, \
backend, \
layout, \
args_def_fn, \
......@@ -342,7 +329,6 @@ struct KernelRegistrar {
...) \
PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \
kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -354,7 +340,6 @@ struct KernelRegistrar {
// clang-format on
#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -363,17 +348,17 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
::pten::KernelArgsParseFunctor<decltype( \
&meta_kernel_fn<cpp_dtype>)>::Parse, \
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>));
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; }
#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -382,8 +367,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -392,7 +377,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -400,7 +384,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -409,8 +392,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -419,7 +402,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -427,7 +409,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -436,8 +417,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -446,7 +427,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -454,7 +434,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -463,8 +442,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -473,7 +452,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -481,7 +459,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -490,8 +467,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -500,7 +477,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -508,7 +484,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -517,8 +492,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -527,7 +502,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -535,7 +509,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -544,8 +517,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -554,7 +527,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -562,7 +534,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -571,8 +542,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -581,7 +552,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -589,7 +559,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -598,8 +567,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -608,7 +577,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -616,7 +584,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -625,8 +592,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -635,7 +602,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -643,7 +609,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -652,8 +617,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -662,7 +627,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -670,7 +634,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -679,8 +642,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -689,7 +652,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -697,7 +659,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -706,8 +667,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -716,7 +677,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -724,7 +684,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -733,8 +692,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -743,7 +702,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -751,90 +709,59 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define PT_REGISTER_KERNEL_STANDARD( \
kernel_name, backend, layout, dtype, kernel_fn) \
_PT_REGISTER_KERNEL_STANDARD( \
kernel_name, PT_ID, backend, layout, dtype, kernel_fn)
#define _PT_REGISTER_KERNEL_STANDARD( \
kernel_name, func_id, backend, layout, dtype, kernel_fn) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
"_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \
template decltype(kernel_fn) kernel_fn; \
static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel*); \
static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \
func_id)( \
kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
DATATYPE(dtype), \
::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
args_def_fn, \
PT_KERNEL(kernel_fn)); \
void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*)
// use to declare symbol
#define PT_REGISTER_MODULE(name) \
int RegisterSymbolsFor##name() { return 0; }
#define PT_DECLARE_MODULE(name) \
extern int RegisterSymbolsFor##name(); \
UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name()
// only used in cpp tests
#define PT_REGISTER_KERNEL_FOR_TEST( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
_PT_REGISTER_KERNEL_FOR_TEST(kernel_name, \
PT_ID, \
backend, \
layout, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
#define _PT_REGISTER_KERNEL_FOR_TEST( \
kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id), \
"PT_REGISTER_KERNEL must be called in global namespace."); \
static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \
func_id)(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT( \
kernel_name, \
func_id, \
backend, \
layout, \
&PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \
func_id)(::pten::Kernel * kernel)
#define PT_REGISTER_KERNEL_WITH_NO_TYPE( \
kernel_name, backend, layout, meta_kernel_fn) \
_PT_REGISTER_KERNEL_WITH_NO_TYPE( \
kernel_name, PT_ID, backend, layout, meta_kernel_fn)
#define _PT_REGISTER_KERNEL_WITH_NO_TYPE( \
kernel_name, func_id, backend, layout, meta_kernel_fn) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
"PT_REGISTER_KERNEL must be called in global namespace."); \
decltype(meta_kernel_fn) meta_kernel_fn; \
static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel*); \
static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \
func_id)( \
kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::pten::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \
&PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
PT_KERNEL(meta_kernel_fn)); \
void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel * kernel)
/** PT_REGISTER_SINGLE_KERNEL
*
* Used to register a single kernel. Pass in the complete function pointer
* of the kernel; this registration macro does not do automatic template
* instantiation.
*/
#define PT_REGISTER_SINGLE_KERNEL( \
kernel_name, backend, layout, dtype, kernel_fn) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
pt_register_single_kernel_ns_check_##kernel_name, \
"PT_REGISTER_SINGLE_KERNEL must be called in global namespace."); \
static void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
static const ::pten::KernelRegistrar __reg_pt_single_kernel_##kernel_name( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
DATATYPE(dtype), \
::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
args_def_fn, \
PT_KERNEL(kernel_fn)); \
int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \
void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*)
/** PT_REGISTER_KERNEL_ALL_DTYPE
*
* Used to register a kernel that supports all data types, such as copy and
* reshape, which are not sensitive to data types.
*/
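// For example, the copy kernels later in this change are registered with:
//   PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {}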
#define PT_REGISTER_KERNEL_ALL_DTYPE(kernel_name, backend, layout, kernel_fn) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
pt_register_kernel_all_dtype_ns_check_##kernel_name, \
"PT_REGISTER_KERNEL_ALL_DTYPE must be called in global namespace."); \
static void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name( \
::pten::Kernel*); \
static const ::pten::KernelRegistrar \
__reg_pt_kernel_all_dtype_##kernel_name( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
&__PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name, \
PT_KERNEL(kernel_fn)); \
int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \
void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(::pten::Kernel* kernel)
/** PT_DECLARE_KERNEL
*
* Used to export the symbols of the file where the kernel is located,
* so that they are not removed by the linker.
*/
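// For example, the declarations earlier in this change use:
//   PT_DECLARE_KERNEL(full_like, CPU);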
#define PT_DECLARE_KERNEL(kernel_name, backend) \
extern int TouchKernelSymbolFor_##kernel_name##_##backend(); \
UNUSED static int __declare_kernel_symbol_for_##kernel_name##_##backend = \
TouchKernelSymbolFor_##kernel_name##_##backend()
} // namespace pten
......@@ -61,9 +61,7 @@ void FillConstant(const CPUContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(CreationCPU);
PT_REGISTER_KERNEL("full_like",
PT_REGISTER_KERNEL(full_like,
CPU,
ANY,
pten::FillAnyLike,
......@@ -74,7 +72,7 @@ PT_REGISTER_KERNEL("full_like",
bool,
paddle::platform::float16) {}
PT_REGISTER_KERNEL("full",
PT_REGISTER_KERNEL(full,
CPU,
ANY,
pten::FillConstant,
......
......@@ -70,12 +70,10 @@ void Matmul(const CPUContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(LinalgCPU);
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
PT_REGISTER_KERNEL("dot",
PT_REGISTER_KERNEL(dot,
CPU,
ANY,
pten::Dot,
......@@ -87,5 +85,4 @@ PT_REGISTER_KERNEL("dot",
complex128) {}
PT_REGISTER_KERNEL(
"matmul_v2", CPU, ANY, pten::Matmul, float, double, complex64, complex128) {
}
matmul, CPU, ANY, pten::Matmul, float, double, complex64, complex128) {}
......@@ -130,12 +130,9 @@ void Cast(const CPUContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(ManipulationCPU);
// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
// architecture, kernel_name should be "flatten".
PT_REGISTER_KERNEL("flatten",
PT_REGISTER_KERNEL(flatten,
CPU,
ANY,
pten::Flatten,
......@@ -145,8 +142,7 @@ PT_REGISTER_KERNEL("flatten",
int8_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("flatten.mid",
PT_REGISTER_KERNEL(flatten_mid,
CPU,
ANY,
pten::FlattenWithXShape,
......@@ -156,7 +152,8 @@ PT_REGISTER_KERNEL("flatten.mid",
int8_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("cast",
PT_REGISTER_KERNEL(cast,
CPU,
ANY,
pten::Cast,
......@@ -174,42 +171,33 @@ PT_REGISTER_KERNEL("cast",
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
// TODO(yuanrisheng): "reshape2" is compatible with old kernel
// architecture, kernel_name should be "reshape".
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape",
CPU,
ANY,
pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid",
CPU,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", CPU, ANY, pten::ReshapeFromDT) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CPU, ANY, pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid,
CPU,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CPU, ANY, pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid",
CPU,
ANY,
pten::ReshapeFromDTWithXShape) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid,
CPU,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost",
CPU,
ANY,
pten::ReshapeFromVectorDT) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost,
CPU,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid",
CPU,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid,
CPU,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
......@@ -106,18 +106,14 @@ DEFINE_CPU_ELEMENTWISE_OP(Mul)
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(MathCPU);
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
// NOTE(chenweihang): using bfloat16 here would cause a redefinition conflict with the xpu bfloat16
// using bfloat16 = ::paddle::platform::bfloat16;
PT_REGISTER_KERNEL("sign", CPU, ANY, pten::Sign, float, double) {}
PT_REGISTER_KERNEL("mean", CPU, ANY, pten::Mean, float, double, bool) {}
PT_REGISTER_KERNEL("scale",
PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {}
PT_REGISTER_KERNEL(mean, CPU, ANY, pten::Mean, float, double, bool) {}
PT_REGISTER_KERNEL(scale,
CPU,
ANY,
pten::Scale,
......@@ -129,8 +125,7 @@ PT_REGISTER_KERNEL("scale",
int16_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("add",
PT_REGISTER_KERNEL(add,
CPU,
ANY,
pten::ElementwiseAdd,
......@@ -140,7 +135,7 @@ PT_REGISTER_KERNEL("add",
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL("subtract",
PT_REGISTER_KERNEL(subtract,
CPU,
ANY,
pten::ElementwiseSub,
......@@ -150,7 +145,7 @@ PT_REGISTER_KERNEL("subtract",
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL("divide",
PT_REGISTER_KERNEL(divide,
CPU,
ANY,
pten::ElementwiseDiv,
......@@ -160,7 +155,7 @@ PT_REGISTER_KERNEL("divide",
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL("multiply",
PT_REGISTER_KERNEL(multiply,
CPU,
ANY,
pten::ElementwiseMul,
......@@ -171,8 +166,7 @@ PT_REGISTER_KERNEL("multiply",
bool,
complex64,
complex128) {}
PT_REGISTER_KERNEL("sum",
PT_REGISTER_KERNEL(sum,
CPU,
ANY,
pten::Sum,
......
......@@ -57,7 +57,4 @@ void Copy(const CPUContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(UtilsCPU);
PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pten::Copy) {}
PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {}
......@@ -62,9 +62,7 @@ void FillConstant(const CUDAContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(CreationCUDA);
PT_REGISTER_KERNEL("full_like",
PT_REGISTER_KERNEL(full_like,
CUDA,
ANY,
pten::FillAnyLike,
......@@ -75,7 +73,7 @@ PT_REGISTER_KERNEL("full_like",
bool,
paddle::platform::float16) {}
PT_REGISTER_KERNEL("full",
PT_REGISTER_KERNEL(full,
CUDA,
ANY,
pten::FillConstant,
......
......@@ -54,13 +54,11 @@ void Matmul(const CUDAContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(LinalgCUDA);
using float16 = paddle::platform::float16;
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
PT_REGISTER_KERNEL("dot",
PT_REGISTER_KERNEL(dot,
CUDA,
ANY,
pten::Dot,
......@@ -71,7 +69,7 @@ PT_REGISTER_KERNEL("dot",
complex64,
complex128) {}
PT_REGISTER_KERNEL("matmul_v2",
PT_REGISTER_KERNEL(matmul,
CUDA,
ANY,
pten::Matmul,
......
......@@ -129,13 +129,9 @@ void Cast(const CUDAContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(ManipulationCUDA);
using float16 = paddle::platform::float16;
// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
// architecture, kernel_name should be "flatten".
PT_REGISTER_KERNEL("flatten",
PT_REGISTER_KERNEL(flatten,
CUDA,
ANY,
pten::Flatten,
......@@ -146,8 +142,7 @@ PT_REGISTER_KERNEL("flatten",
int8_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("flatten.mid",
PT_REGISTER_KERNEL(flatten_mid,
CUDA,
ANY,
pten::FlattenWithXShape,
......@@ -159,7 +154,7 @@ PT_REGISTER_KERNEL("flatten.mid",
int64_t) {}
#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \
PT_REGISTER_KERNEL("cast", \
PT_REGISTER_KERNEL(cast, \
CUDA, \
ANY, \
pten::Cast, \
......@@ -184,44 +179,33 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
#endif
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape",
CUDA,
ANY,
pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid",
CUDA,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host",
CUDA,
ANY,
pten::ReshapeFromDT) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CUDA, ANY, pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid,
CUDA,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CUDA, ANY, pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid",
CUDA,
ANY,
pten::ReshapeFromDTWithXShape) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid,
CUDA,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost",
CUDA,
ANY,
pten::ReshapeFromVectorDT) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost,
CUDA,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid",
CUDA,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid,
CUDA,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
......@@ -111,16 +111,13 @@ void Sum(const CUDAContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(MathCUDA);
using float16 = paddle::platform::float16;
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {}
PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, bool) {}
PT_REGISTER_KERNEL("scale",
PT_REGISTER_KERNEL(sign, CUDA, ANY, pten::Sign, float, double, float16) {}
PT_REGISTER_KERNEL(mean, CUDA, ANY, pten::Mean, float, double, bool) {}
PT_REGISTER_KERNEL(scale,
CUDA,
ANY,
pten::Scale,
......@@ -132,7 +129,7 @@ PT_REGISTER_KERNEL("scale",
int16_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("add",
PT_REGISTER_KERNEL(add,
CUDA,
ANY,
pten::ElementwiseAdd,
......@@ -143,7 +140,7 @@ PT_REGISTER_KERNEL("add",
float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL("subtract",
PT_REGISTER_KERNEL(subtract,
CUDA,
ANY,
pten::ElementwiseSub,
......@@ -154,7 +151,7 @@ PT_REGISTER_KERNEL("subtract",
float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL("divide",
PT_REGISTER_KERNEL(divide,
CUDA,
ANY,
pten::ElementwiseDiv,
......@@ -165,7 +162,7 @@ PT_REGISTER_KERNEL("divide",
float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL("multiply",
PT_REGISTER_KERNEL(multiply,
CUDA,
ANY,
pten::ElementwiseMul,
......@@ -177,7 +174,7 @@ PT_REGISTER_KERNEL("multiply",
float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL("sum",
PT_REGISTER_KERNEL(sum,
CUDA,
ANY,
pten::Sum,
......
......@@ -234,7 +234,4 @@ void Copy(const CUDAContext& dev_ctx,
}
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(UtilsCUDA);
PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pten::Copy) {}
PT_REGISTER_KERNEL_ALL_DTYPE(copy, CUDA, ANY, pten::Copy) {}
......@@ -769,6 +769,23 @@ static void LaunchReduceKernel(const Tx* x_data,
}
}
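// Helper for the reduce kernel below: choose the CUDA device context of
// whichever side already lives on the device (preferring dst, falling back to
// src) and forward the copy to pten::Copy.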
void TensorCopy(const DenseTensor& src, DenseTensor* dst) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
const paddle::platform::CUDADeviceContext* dev_ctx;
if (paddle::platform::is_gpu_place(dst->place()) ||
paddle::platform::is_npu_place(dst->place())) {
dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
pool.Get(dst->place()));
} else {
dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
pool.Get(src.place()));
}
pten::Copy(*dev_ctx, src, false, dst);
}
template <typename Tx,
typename Ty,
template <typename, typename> class ReduceOp>
......@@ -800,7 +817,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
if (config.reduce_num == 1) {
auto out_dims = y->dims();
if (x.dtype() == y->dtype()) {
pten::Copy(*dev_ctx, x, true, y);
TensorCopy(x, y);
y->Resize(out_dims);
} else {
PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] {
......
......@@ -95,12 +95,7 @@ void ReshapeFromVectorDT(const XPUContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(ManipulationXPU);
// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
// architecture, kernel_name should be "flatten".
PT_REGISTER_KERNEL("flatten_contiguous_range",
PT_REGISTER_KERNEL(flatten,
XPU,
ANY,
pten::Flatten,
......@@ -112,7 +107,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
int,
int64_t) {}
PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
PT_REGISTER_KERNEL(flatten_mid,
XPU,
ANY,
pten::FlattenWithXShape,
......@@ -124,9 +119,4 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
int,
int64_t) {}
// TODO(yuanrisheng): "reshape2" is compatible with old kernel
// architecture, kernel_name should be "reshape".
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2",
XPU,
ANY,
pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape, XPU, ANY, pten::ReshapeFromVectorVal) {}
......@@ -76,7 +76,4 @@ void Copy(const XPUDeviceContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(UtilsXPU);
PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {}
PT_REGISTER_KERNEL_ALL_DTYPE(copy, XPU, ANY, pten::Copy) {}
......@@ -21,12 +21,6 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
PT_DECLARE_MODULE(ManipulationCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(ManipulationCUDA);
#endif
namespace paddle {
namespace tests {
......
......@@ -156,6 +156,9 @@ from .tensor.manipulation import roll # noqa: F401
from .tensor.manipulation import chunk # noqa: F401
from .tensor.manipulation import tolist # noqa: F401
from .tensor.manipulation import tensordot # noqa: F401
from .tensor.manipulation import as_complex # noqa: F401
from .tensor.manipulation import as_real # noqa: F401
from .tensor.math import abs # noqa: F401
from .tensor.math import acos # noqa: F401
from .tensor.math import asin # noqa: F401
......@@ -227,6 +230,8 @@ from .tensor.math import lgamma # noqa: F401
from .tensor.math import lerp # noqa: F401
from .tensor.math import rad2deg # noqa: F401
from .tensor.math import deg2rad # noqa: F401
from .tensor.math import gcd # noqa: F401
from .tensor.math import lcm # noqa: F401
from .tensor.math import diff # noqa: F401
from .tensor.math import angle # noqa: F401
......@@ -260,6 +265,7 @@ from .framework.random import set_cuda_rng_state # noqa: F401
from .framework import ParamAttr # noqa: F401
from .framework import create_parameter # noqa: F401
from .framework import CPUPlace # noqa: F401
from .framework import IPUPlace # noqa: F401
from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401
......@@ -291,6 +297,7 @@ from .fluid.framework import get_flags # noqa: F401
from .fluid.framework import set_flags # noqa: F401
from .device import is_compiled_with_xpu # noqa: F401
from .device import is_compiled_with_npu # noqa: F401
from .device import is_compiled_with_ipu # noqa: F401
from .device import XPUPlace # noqa: F401
from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401
......@@ -478,6 +485,8 @@ __all__ = [ # noqa
'atan2',
'rad2deg',
'deg2rad',
'gcd',
'lcm',
'expand',
'broadcast_to',
'ones_like',
......@@ -553,6 +562,8 @@ __all__ = [ # noqa
'einsum',
'set_flags',
'get_flags',
'as_complex',
'as_real',
'diff',
'angle',
]
......@@ -28,7 +28,9 @@ __all__ = [ # noqa
'set_device',
'get_device',
'XPUPlace',
'IPUPlace',
'is_compiled_with_xpu',
'is_compiled_with_ipu',
'is_compiled_with_cinn',
'is_compiled_with_cuda',
'is_compiled_with_rocm',
......@@ -55,6 +57,36 @@ def is_compiled_with_npu():
return core.is_compiled_with_npu()
def is_compiled_with_ipu():
"""
Whether paddle was built with WITH_IPU=ON to support Graphcore IPU.
Returns (bool): `True` if IPU is supported, otherwise `False`.
Examples:
.. code-block:: python
import paddle
support_ipu = paddle.is_compiled_with_ipu()
"""
return core.is_compiled_with_ipu()
def IPUPlace():
"""
Return a Graphcore IPU Place
Examples:
.. code-block:: python
# required: ipu
import paddle
place = paddle.device.IPUPlace()
"""
return core.IPUPlace()
def is_compiled_with_xpu():
"""
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
......@@ -143,13 +175,19 @@ def _convert_to_place(device):
selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
device_id = int(selected_npus[0])
place = core.NPUPlace(device_id)
elif lower_device == 'ipu':
if not core.is_compiled_with_ipu():
raise ValueError(
"The device should not be 'ipu', " \
"since PaddlePaddle is not compiled with IPU")
place = core.IPUPlace()
else:
avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
avaliable_npu_device = re.match(r'npu:\d+', lower_device)
if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device:
raise ValueError(
"The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu' or 'npu:x'"
"The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or ipu"
)
if avaliable_gpu_device:
if not core.is_compiled_with_cuda():
......@@ -183,13 +221,13 @@ def _convert_to_place(device):
def set_device(device):
"""
Paddle supports running calculations on various types of devices, including CPU, GPU, XPU and NPU.
Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU.
They are represented by string identifiers. This function can specify the global device
which the OP will run.
Parameters:
device(str): This parameter determines the specific running device.
It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x`` and ``npu:x``,
It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``,
where ``x`` is the index of the GPUs, XPUs or NPUs.
Examples:
......@@ -236,5 +274,10 @@ def get_device():
elif isinstance(place, core.NPUPlace):
device_id = place.get_device_id()
device = 'npu:' + str(device_id)
elif isinstance(place, core.IPUPlace):
num_devices = core.get_ipu_device_count()
device = "ipus:{{0-{}}}".format(num_devices - 1)
else:
raise ValueError("The device specification {} is invalid".format(place))
return device
......@@ -296,6 +296,83 @@ class DistributedMatmulImpl0(DistributedOperatorImpl):
return False
return True
def is_auto_compatible(self, dist_op):
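# Reject any dims mappings annotated on X, Y and Out that this matmul
# implementation cannot shard consistently.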
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
assert len(x_dims_mapping) >= len(
y_dims_mapping), "now just support x dims > y dims"
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_replicate(out_dims_mapping[-1]):
return False
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
input_dims_mapping = []
ordered_input_shard_dims_mapping = []
for dim in (x_dims_mapping + y_dims_mapping):
input_dims_mapping.append(dim)
for item in input_dims_mapping:
if item not in ordered_input_shard_dims_mapping and item != -1:
ordered_input_shard_dims_mapping.append(item)
for mapping in out_dims_mapping:
if mapping not in input_dims_mapping:
return False
if is_dim_shard(x_dims_mapping[0]):
order_index = 0
for idx, item in enumerate(out_dims_mapping):
if item != -1:
if item != ordered_input_shard_dims_mapping[order_index]:
return False
else:
order_index += 1
if order_index != len(ordered_input_shard_dims_mapping):
return False
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(y_dims_mapping[
1]):
return False
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
if is_dim_shard(x_dims_mapping[0]):
for mapping in y_dims_mapping[1:]:
if is_dim_shard(mapping) and mapping == x_dims_mapping[0]:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -510,6 +587,95 @@ class DistributedMatmulImpl1(DistributedOperatorImpl):
return False
return True
def is_auto_compatible(self, dist_op):
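# As above, validate that the annotated dims mappings of X, Y and Out fit
# this implementation's sharding pattern.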
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if op_desc.attr('transpose_X') or op_desc.attr('transpose_Y'):
return False
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
# for gpt2, x dims > y dims, this is a temporary solution
assert len(x_dims_mapping) >= len(
y_dims_mapping), "now just support x dims > y dims"
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_shard(out_dims_mapping[-1]):
return False
# Other dimensions must be replicated except the batch dimension
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
if is_dim_replicate(x_dims_mapping[-1]):
return False
if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-1]):
return False
# Other dimensions must be replicated except the batch dimension
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
x_shard_dim_count = 0
x_shard_dims = []
y_shard_dim_count = 0
y_shard_dims = []
for dim in x_dims_mapping:
if is_dim_shard(dim):
x_shard_dim_count += 1
x_shard_dims.append(dim)
for dim in y_dims_mapping:
if is_dim_shard(dim):
y_shard_dim_count += 1
y_shard_dims.append(dim)
if not x_shard_dims and not y_shard_dims:
return False
if x_shard_dims[-1] != y_shard_dims[0]:
return False
if x_shard_dim_count == y_shard_dim_count:
for dim in out_dims_mapping:
if is_dim_shard(dim):
return False
if x_shard_dims != y_shard_dims:
return False
else:
if x_shard_dim_count < y_shard_dim_count:
return False
output_shard_dims = []
for dim in out_dims_mapping:
if is_dim_shard(dim):
output_shard_dims.append(dim)
if not output_shard_dims or output_shard_dims[0] != x_shard_dims[0]:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -710,6 +876,59 @@ class DistributedMatmulImpl2(DistributedOperatorImpl):
return True
def is_auto_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
assert len(x_dims_mapping) >= len(
y_dims_mapping
), "now just support x dims > y dims,but x:{0} and y:{1}".format(
x_dims_mapping, y_dims_mapping)
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_shard(out_dims_mapping[-1]):
return False
if is_valid_list_index(out_dims_mapping,
-2) and is_dim_shard(out_dims_mapping[-2]):
return False
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_valid_list_index(x_dims_mapping,
-2) and is_dim_shard(x_dims_mapping[-2]):
return False
if is_dim_shard(y_dims_mapping[-1]):
return False
if is_valid_list_index(y_dims_mapping,
-2) and is_dim_shard(y_dims_mapping[-2]):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -777,6 +996,86 @@ class DistributedMatmulV2Impl0(DistributedOperatorImpl):
return False
return True
def is_auto_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if op_desc.attr('trans_x') or op_desc.attr('trans_y'):
return False
assert len(x_dims_mapping) >= len(
y_dims_mapping), "now just support x dims > y dims"
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_replicate(out_dims_mapping[-1]):
return False
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
input_dims_mapping = []
ordered_input_shard_dims_mapping = []
for dim in (x_dims_mapping + y_dims_mapping):
input_dims_mapping.append(dim)
for item in input_dims_mapping:
if item not in ordered_input_shard_dims_mapping and item != -1:
ordered_input_shard_dims_mapping.append(item)
for mapping in out_dims_mapping:
if mapping not in input_dims_mapping:
return False
if is_dim_shard(x_dims_mapping[0]):
order_index = 0
for idx, item in enumerate(out_dims_mapping):
if item != -1:
if item != ordered_input_shard_dims_mapping[order_index]:
return False
else:
order_index += 1
if order_index != len(ordered_input_shard_dims_mapping):
return False
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(y_dims_mapping[
1]):
return False
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
if is_dim_shard(x_dims_mapping[0]):
for mapping in y_dims_mapping[1:]:
if is_dim_shard(mapping) and mapping == x_dims_mapping[0]:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -985,6 +1284,94 @@ class DistributedMatmulV2Impl1(DistributedOperatorImpl):
return False
return True
def is_auto_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if op_desc.attr('trans_x') or op_desc.attr('trans_y'):
return False
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
assert len(x_dims_mapping) >= len(
y_dims_mapping), "now just support x dims > y dims"
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_shard(out_dims_mapping[-1]):
return False
# Other dimensions must be replicated except the batch dimension
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
if is_dim_replicate(x_dims_mapping[-1]):
return False
if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-1]):
return False
# Other dimensions must be replicated except the batch dimension
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
x_shard_dim_count = 0
x_shard_dims = []
y_shard_dim_count = 0
y_shard_dims = []
for dim in x_dims_mapping:
if is_dim_shard(dim):
x_shard_dim_count += 1
x_shard_dims.append(dim)
for dim in y_dims_mapping:
if is_dim_shard(dim):
y_shard_dim_count += 1
y_shard_dims.append(dim)
if not x_shard_dims and not y_shard_dims:
return False
if x_shard_dims[-1] != y_shard_dims[0]:
return False
if x_shard_dim_count == y_shard_dim_count:
for dim in out_dims_mapping:
if is_dim_shard(dim):
return False
if x_shard_dims != y_shard_dims:
return False
else:
if x_shard_dim_count < y_shard_dim_count:
return False
output_shard_dims = []
for dim in out_dims_mapping:
if is_dim_shard(dim):
output_shard_dims.append(dim)
if not output_shard_dims or output_shard_dims[0] != x_shard_dims[0]:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -1183,6 +1570,61 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl):
return True
def is_auto_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
assert len(x_dims_mapping) >= len(
y_dims_mapping
), "now just support x dims > y dims,but x:{0} and y:{1}".format(
x_dims_mapping, y_dims_mapping)
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_shard(out_dims_mapping[-1]):
return False
if is_valid_list_index(out_dims_mapping,
-2) and is_dim_shard(out_dims_mapping[-2]):
return False
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_valid_list_index(x_dims_mapping,
-2) and is_dim_shard(x_dims_mapping[-2]):
return False
if is_dim_shard(y_dims_mapping[-1]):
return False
if is_valid_list_index(y_dims_mapping,
-2) and is_dim_shard(y_dims_mapping[-2]):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......
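Note for readers of the is_auto_compatible checks above: they reason purely over dims_mapping lists. Below is a minimal, hedged sketch of that convention, assuming, as the helpers referenced here do, that -1 marks a replicated tensor dimension and a non-negative integer names the process-mesh axis along which that dimension is sharded; the concrete mappings are illustrative and not taken from this change.

# Minimal sketch of the dims_mapping convention assumed by is_auto_compatible.
def is_dim_shard(mapping):
    # a dimension is sharded when it is mapped onto some mesh axis
    return mapping != -1

def is_dim_replicate(mapping):
    # -1 means the dimension is replicated across the mesh
    return mapping == -1

# Column-parallel case (the Impl0-style checks): X replicated, Y split on its
# last dim over mesh axis 0, so Out carries the same split on its last dim.
x_dims_mapping = [-1, -1]
y_dims_mapping = [-1, 0]
out_dims_mapping = [-1, 0]
assert is_dim_replicate(x_dims_mapping[-1])
assert is_dim_shard(y_dims_mapping[-1])
assert out_dims_mapping[-1] == y_dims_mapping[-1]

# Row-parallel case (the Impl1-style checks): the contracted dim is split on
# the same mesh axis in X and Y, and Out stays replicated, so an all-reduce
# is needed after the local matmul.
x_dims_mapping = [-1, 0]
y_dims_mapping = [0, -1]
out_dims_mapping = [-1, -1]
assert x_dims_mapping[-1] == y_dims_mapping[0]
assert all(is_dim_replicate(m) for m in out_dims_mapping)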
......@@ -27,11 +27,13 @@ from collections import OrderedDict
import paddle
import paddle.fluid as fluid
from paddle import framework
from paddle.fluid import core
import paddle.distributed as dist
from paddle.optimizer import Optimizer
from paddle.fluid.clip import ClipGradByGlobalNorm
from ...utils.internal_storage import ParamStorage
from ...meta_parallel.sharding.sharding_utils import Type
from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad
# CUDA alignment 256 bytes
alignment = {"gpu": 256, }
......@@ -99,16 +101,41 @@ class ShardingOptimizerStage2(Optimizer):
self.broadcast_fp16 = broadcast_fp16
self.param_storages = {} # {dtype: {rank: InternalStorage}}
if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
logging.warning(
"While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
)
self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
group,
paddle.get_device())
if offload:
assert self._pfp16, "The offload strategy is only supported with the \'Adam\', \'AdamW\' and \'Momentum\' optimizers under AMP/Pure FP16"
self.offload = offload  # whether to offload the optimizer state to CPU
self.offload_device = "cpu"
self._master_params = {}
# Update the optimizer's parameters, and adjust parameter storage and usage according to rank.
self.update_opt_status()
def _generate_master_params(self, trainable_params):
for param in trainable_params:
if param.dtype == Type.fp16.value:
self._optim._master_weights[param.name] = paddle.cast(
param, Type.fp32.value)
if self.offload:
for param in trainable_params:
if param.name not in self._master_params.keys():
self._master_params[param.name] = core.VarBase(
name=param.name,
value=param.cast(dtype=Type.fp32.value).numpy(),
place=core.CPUPlace(),
stop_gradient=param.stop_gradient)
self._optim._master_weights = self._master_params
else:
for param in trainable_params:
if param.dtype == Type.fp16.value:
self._optim._master_weights[param.name] = paddle.cast(
param, Type.fp32.value)
def update_opt_status(self):
"""Update optimizer status and parameter storage information, and special functions to be developed.
......@@ -243,22 +270,43 @@ class ShardingOptimizerStage2(Optimizer):
A wrapper for Optimizer's step function to finish the update operation of the optimizer.
"""
# Synchronize optimizer parameters for the current rank
if len(self.dtype_rank_params.keys(
)) == 1 and Type.fp32.value in self.dtype_rank_params.keys():
self._optim._parameter_list = self.dtype_rank_params[
Type.fp32.value][self.rank]
elif len(self.dtype_rank_params.keys(
)) == 1 and Type.fp16.value in self.dtype_rank_params.keys():
self._optim._parameter_list = self.dtype_rank_params[
Type.fp16.value][self.rank]
if self.offload:
self._optim._parameter_list = [
param for name, param in self._master_params.items()
]
else:
self._optim._parameter_list = self.dtype_rank_params[
Type.fp16.value][self.rank] + self.dtype_rank_params[
# Synchronize optimizer parameters for the current rank
if len(self.dtype_rank_params.keys(
)) == 1 and Type.fp32.value in self.dtype_rank_params.keys():
self._optim._parameter_list = self.dtype_rank_params[
Type.fp32.value][self.rank]
elif len(self.dtype_rank_params.keys(
)) == 1 and Type.fp16.value in self.dtype_rank_params.keys():
self._optim._parameter_list = self.dtype_rank_params[
Type.fp16.value][self.rank]
else:
self._optim._parameter_list = self.dtype_rank_params[
Type.fp16.value][self.rank] + self.dtype_rank_params[
Type.fp32.value][self.rank]
# Run the optimizer of the current rank step
self._optim.step()
if self.offload:
with device_guard(self.rank, self.offload_device):
self._optim.step()
for param in self._optim._parameter_list:
self._master_params[param.name].set_value(param)
dev_id = 0 if paddle.get_device() == "cpu" else int(
paddle.get_device().split(":")[1])
for param in self._local_params:
if param.name in self._master_params.keys():
param.set_value(self._master_params[param.name].cuda(dev_id)
.cast(dtype=param.dtype))
self._master_params[param.name].clear_gradient(False)
else:
self._optim.step()
# Synchronize all the updated shards in between the ranks
self._broadcast_params()
......
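The offload branch above keeps an fp32 master copy of every fp16 parameter in host memory, runs the optimizer step there, and then writes the result back into the device parameter. Below is a minimal, hypothetical sketch of that round trip using the same VarBase/cast/set_value calls as the code above; the tensor name, shape and dev_id are illustrative, and a CUDA build with one visible GPU is assumed.

import paddle
from paddle.fluid import core

param = paddle.ones([4], dtype='float16')  # device-side fp16 parameter (illustrative)

# fp32 master copy kept in host memory while offload is enabled
master = core.VarBase(
    name='master_w',
    value=param.cast(dtype=paddle.float32).numpy(),
    place=core.CPUPlace(),
    stop_gradient=False)

# ... the CPU optimizer step updates `master` in place here ...

# write the updated master weight back into the fp16 parameter on the device
dev_id = 0  # illustrative device id
param.set_value(master.cuda(dev_id).cast(dtype=param.dtype))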
......@@ -112,6 +112,18 @@ class ShardingStage2(nn.Layer):
self._has_grad_storage = []
self._grad_storage_list = []
# offload
# TODO(haohongxiang): The offload strategy is not yet supported with multiple optimizers
self._offload_optims = list(
filter(lambda optim: optim.offload, self._sharding_optimizers))
if len(self._offload_optims) > 0:
assert len(
self._sharding_optimizers
) == 1, "Only support offload strategy for single optimizer"
self._offload = self._sharding_optimizers[0].offload
self._offload_device = "cpu"
# Set backward pass hooks
self._bw_hooks = []
......@@ -156,7 +168,8 @@ class ShardingStage2(nn.Layer):
# Release grad storages
for dtype in self._grad_storages.keys():
if self._rank in self._grad_storages[dtype].keys():
self._grad_storages[dtype][self._rank].buffer.zero_()
if not self._offload:
self._grad_storages[dtype][self._rank].buffer.zero_()
# Release params
for param in self._trainable_params:
......@@ -167,17 +180,24 @@ class ShardingStage2(nn.Layer):
"""
Before the gradient accumulation, scale the gradient.
"""
# Scale grad storages
for dtype in self._grad_storages.keys():
if self._rank in self._grad_storages[dtype].keys():
self._grad_storages[dtype][self._rank].buffer.scale_(
scale=self._world_size_scaling)
# Scale params
for param in self._trainable_params:
if param.name in self._param_grads and param.grad is not None:
param.grad.scale_(scale=self._world_size_scaling)
param._reset_grad_inplace_version(True)
if self._offload:
for param in self._trainable_params:
if param.name in self._sharding_optimizers[
0]._master_params.keys():
self._sharding_optimizers[0]._master_params[
param.name].grad.scale_(scale=self._world_size_scaling)
else:
# Scale grad storages
for dtype in self._grad_storages.keys():
if self._rank in self._grad_storages[dtype].keys():
self._grad_storages[dtype][self._rank].buffer.scale_(
scale=self._world_size_scaling)
# Scale params
for param in self._trainable_params:
if param.name in self._param_grads and param.grad is not None:
param.grad.scale_(scale=self._world_size_scaling)
param._reset_grad_inplace_version(True)
def _init_internal_storage(self, needs_fresh):
"""
......@@ -195,8 +215,14 @@ class ShardingStage2(nn.Layer):
"""
Synchronously or asynchronously convert the data type of the layer; changing the device is not supported yet.
"""
assert isinstance(device, str), "Device must be type str"
assert device == self._default_device, "New devices are not supported, because the optimizer state is not synchronized"
self._layer.to(device=device, dtype=dtype, blocking=blocking)
# Re-build the buckets, hooks, etc.
self._fresh_trainable()
def _fresh_trainable(self):
""" Whether to update training parameters. """
......@@ -283,12 +309,17 @@ class ShardingStage2(nn.Layer):
self._grad_reduced[index] = False
if not self._accumulate_grads:
param.grad.scale_(scale=self._world_size_scaling)
param._reset_grad_inplace_version(True)
param._reset_grad_inplace_version(True)
# Clear the gradient that does not belong to the current rank through the callback function
def cleanup():
if dst_rank != self._rank:
param.clear_gradient(False)
elif self._offload:
self._sharding_optimizers[0]._master_params[
param.name]._copy_gradient_from(param.grad.cpu(
).cast(dtype=Type.fp32.value))
param.clear_gradient(False)
# Synchronize the reduce of the parameter gradient
self._tasks_flow.append(
......@@ -339,6 +370,15 @@ class ShardingStage2(nn.Layer):
grad_storage.buffer.value().get_tensor()._clear(
)
elif self._offload:
grad_storage.to(device=self._offload_device)
for param in grad_storage._params:
self._sharding_optimizers[0]._master_params[
param.name]._copy_gradient_from(
param.grad.cast(
dtype=Type.fp32.value))
grad_storage.buffer.value().get_tensor()._clear(
)
# Reduce the bucket
grad_storage.sent = True
......@@ -478,7 +518,7 @@ class ShardingStage2(nn.Layer):
# Rebuild fp16/fp32 grad storages
for dtype in self._grad_storages.keys():
for dst_rank, grad_storage in self._grad_storages[dtype].items():
if dst_rank != self._rank:
if self._offload or dst_rank != self._rank:
grad_storage.manumal_relase()
grad_storage.rebuild()
......
......@@ -17,10 +17,17 @@ import contextlib
from collections import abc
from enum import Enum
from math import inf
import numpy as np
from types import MethodType
import paddle
import paddle.distributed as dist
from paddle import _C_ops
from paddle.fluid import core
from paddle.fluid import layers
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import dygraph_only
from paddle.fluid.dygraph import base as imperative_base
class Taskflow:
......@@ -41,6 +48,88 @@ class Type(Enum):
fp32 = paddle.float32
class ShardingClipGrad:
def __init__(self, clip, group, device):
self._clip = clip
self._group = group
self._device = device
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
sum_square_fp16 = []
sum_square_fp32 = []
for p, g in params_grads:
if g is None or getattr(p, 'need_clip', True) is False:
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.get_tensor_from_selected_rows(
layers.merge_selected_rows(g))
square = layers.square(merge_grad)
sum_square = layers.reduce_sum(square)
if p.dtype == paddle.float16:
sum_square_fp16.append(sum_square)
elif p.dtype == paddle.float32:
sum_square_fp32.append(sum_square)
# global norm of non-distributed FP16 params_and_grads
if len(sum_square_fp16) == 0:
global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
else:
global_norm_fp16 = layers.concat(sum_square_fp16)
global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
global_norm_fp16 = paddle.cast(
global_norm_fp16, dtype=paddle.float32)
# global norm of non-distributed FP32 params_and_grads
global_norm_fp32 = layers.concat(sum_square_fp32) if len(
sum_square_fp32) != 0 else paddle.to_tensor(
[0.], dtype=paddle.float32)
global_norm_fp32 = layers.reduce_sum(global_norm_fp32)
global_norm_var = global_norm_fp16 + global_norm_fp32
# add all reduce to get global norm of distributed params_and_grads
dev_id = int(self._device.split(":")[1])
with device_guard(dev_id, "gpu"):
paddle.distributed.all_reduce(global_norm_var, group=self._group)
global_norm_var = layers.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
clip_var = layers.elementwise_div(
x=max_global_norm,
y=layers.elementwise_max(
x=global_norm_var, y=max_global_norm))
clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
for p, g in params_grads:
if g is None:
continue
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
if p.dtype == paddle.float16:
new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
else:
new_grad = layers.elementwise_mul(x=g, y=clip_var)
params_and_grads.append((p, new_grad))
return params_and_grads
def __getattr__(self, item):
return getattr(self._clip, item)
def __call__(self, params_grads):
return self._dygraph_clip(params_grads)
@contextlib.contextmanager
def device_guard(dev_id, device="cpu"):
origin_device = paddle.device.get_device()
......@@ -52,3 +141,65 @@ def device_guard(dev_id, device="cpu"):
yield
finally:
paddle.set_device(origin_device)
@dygraph_only
def ShardingScaler(scaler, sharding_group):
def unscale_method(self, optimizer):
if not self._enable:
return
param_grads = []
param_grads_fp16 = []
param_grads_fp32 = []
if getattr(optimizer, '_param_groups', None) and isinstance(
optimizer._param_groups[0], dict):
for group in optimizer._param_groups:
for param in group['params']:
if param._grad_ivar() is not None:
param_grads.append(param._grad_ivar())
if param._grad_ivar(
).dtype == core.VarDesc.VarType.FP16:
param_grads_fp16.append(param._grad_ivar())
else:
param_grads_fp32.append(param._grad_ivar())
else:
param_grads = [
param._grad_ivar() for param in optimizer._parameter_list
if param._grad_ivar() is not None
]
param_grads_fp16 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16
)
]
param_grads_fp32 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32
)
]
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
if len(param_grads_fp16):
_C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
param_grads_fp16,
temp_found_inf_fp16)
if len(param_grads_fp32):
_C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
param_grads_fp32,
temp_found_inf_fp32)
self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
paddle.distributed.all_reduce(
is_found_inf,
op=paddle.distributed.ReduceOp.MAX,
group=sharding_group)
self._found_inf = is_found_inf.numpy()[0]
scaler._unscale = MethodType(unscale_method, scaler)
return scaler
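A short, hedged usage sketch of the device_guard helper defined above, which both ShardingClipGrad and the offload path rely on; the import path is the one used elsewhere in this change, and the tensor and device id are illustrative.

import paddle
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import device_guard

# Ops created inside the context run on the requested device; the previous
# default device is restored on exit.
with device_guard(0, "cpu"):
    host_tensor = paddle.zeros([8], dtype='float32')

# For reference, ShardingClipGrad above rescales every gradient by
#     clip_norm / max(global_norm, clip_norm)
# e.g. clip_norm = 1.0 with global_norm = 4.0 scales each gradient by 0.25.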
......@@ -50,6 +50,29 @@ class InternalStorage:
else:
self.buffer = paddle.zeros(size, dtype=dtype)
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it"
# dtype=None means "no cast"; otherwise only fp32/fp16 buffers are supported
assert (dtype is None or dtype == Type.fp32.value or
dtype == Type.fp16.value), "Conversion type is not supported now"
dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
.split(":")[1])
if self._device != device:
tmp_buffer = self.buffer.cuda(
dev_id) if device == "gpu" else self.buffer.cpu()
for param in self._params:
param.clear_gradient(False)
param._gradient_set_empty(False)
self.buffer.value().get_tensor()._clear()
self.buffer = tmp_buffer
if dtype is not None:
self.buffer = self.buffer.cast(dtype=dtype)
class ParamStorage(InternalStorage):
"""
......@@ -60,6 +83,16 @@ class ParamStorage(InternalStorage):
super().__init__(size, dtype, device, convert_cpu=True)
self.param2align = None
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
super().to(device, dtype)
if keep_alignment:
self._array_params()
@fluid.dygraph.no_grad
def add_rank_params(self, trainable_params, param2align):
"""
......@@ -78,7 +111,7 @@ class ParamStorage(InternalStorage):
p_shape = self._add_param_as_view(param, param2align[param.name])
cpu_param_shape.append(p_shape)
# buffer covert from cpu to cuda
# buffer convert from cpu to cuda
dev_id = int(paddle.get_device().split(":")[1])
self.buffer = self.buffer.cuda(dev_id)
self._fill = 0
......@@ -109,7 +142,8 @@ class ParamStorage(InternalStorage):
param.stop_gradient = origin_state
# Copy the current param value
dev_id = int(paddle.get_device().split(":")[1])
dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
.split(":")[1])
with device_guard(dev_id, "cpu"):
tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill,
var_end))
......@@ -134,6 +168,18 @@ class ParamStorage(InternalStorage):
self._fill = offset
@fluid.dygraph.no_grad
def _array_params(self):
"""
Given the parameters which have been registered previously, rebuild the whole InternalStorage.
"""
assert len(self._params) > 0
assert self.param2align is not None
self._fill = 0
for p in self._params:
self._convert_buffer(p, p.shape, self.param2align[p.name])
class GradStorage(InternalStorage):
"""
......@@ -171,6 +217,18 @@ class GradStorage(InternalStorage):
param.shape) + align <= self._max_size and id(
param) not in self._param_ids
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
if self._release:
self.rebuild()
super().to(device, dtype)
if keep_alignment:
self._array_grads()
@fluid.dygraph.no_grad
def add_grad(self, param, align):
"""
......@@ -206,17 +264,25 @@ class GradStorage(InternalStorage):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
assert len(self._params) > 0
if self._release:
self.buffer = paddle.zeros(
[self._max_size], dtype=self._params[0].dtype)
self.buffer = paddle.zeros([self._max_size], dtype=self._dtype)
for p in self._params:
self._add_grad_as_view(p, self._parm2align[p.name])
self._release = False
@fluid.dygraph.no_grad
def _array_grads(self):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
if len(self._params) > 0:
self._fill = 0
for p in self._params:
self._add_grad_as_view(p, self._parm2align[p.name])
@fluid.dygraph.no_grad
def _add_grad_as_view(self, param, align):
assert np.prod(
......@@ -229,8 +295,17 @@ class GradStorage(InternalStorage):
assert offset <= np.prod(self.buffer.shape)
# Copy the current grad value to InternalStorage
assert self._device == "gpu"
tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
param._copy_gradient_from(tmp_var)
tmp_var.value().get_tensor()._clear()
dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
.split(":")[1])
if self._device == "cpu":
with device_guard(dev_id, self._device):
tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
param._copy_gradient_from(tmp_var)
tmp_var.value().get_tensor()._clear()
elif self._device == "gpu":
tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
param._copy_gradient_from(tmp_var)
tmp_var.value().get_tensor()._clear()
self._fill = offset
......@@ -305,7 +305,8 @@ class Uniform(Distribution):
else:
output_shape = shape + batch_shape
output = nn.uniform_random(
output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros(
output_shape, dtype=self.dtype, min=0., max=1.,
seed=seed) * (tensor.zeros(
output_shape, dtype=self.dtype) + (self.high - self.low))
output = elementwise_add(output, self.low, name=name)
if self.all_arg_is_float:
......
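For reference, the rewritten sample above is the usual affine transform of a unit uniform draw; a tiny NumPy sketch of the same formula, with illustrative bounds:

import numpy as np

low, high, shape = -2.0, 3.0, (4,)
u = np.random.uniform(0.0, 1.0, shape)  # U(0, 1) draw
sample = low + u * (high - low)         # distributed as U(low, high)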
......@@ -71,7 +71,7 @@ from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, Scope, _Scope
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace
from .incubate import fleet
from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
......@@ -132,6 +132,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'CUDAPlace',
'CUDAPinnedPlace',
'NPUPlace',
'IPUPlace',
'Tensor',
'ParamAttr',
'WeightNormParamAttr',
......@@ -197,6 +198,11 @@ def __bootstrap__():
if os.name == 'nt':
remove_flag_if_exists('cpu_deterministic')
if core.is_compiled_with_ipu():
# Currently we request all available IPUs for training and testing;
# finer-grained control of IPU pods will be added later.
read_env_flags += []
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
# Note(zhouwei25): sys may not have argv in some cases,
# Such as: use Python/C API to call Python from C++
......
......@@ -484,7 +484,7 @@ class ImperativeQuantizeOutputs(object):
model_filename=model_filename,
params_filename=params_filename))
self._gather_scales(infer_program, scope)
self._gather_scales(infer_program, scope, fetch_targets)
self._set_skip_quant_attr(infer_program)
......@@ -520,10 +520,10 @@ class ImperativeQuantizeOutputs(object):
return flag
def _gather_scales(self, program, scope):
def _gather_scales(self, program, scope, fetch_targets):
"""
Get all scales from fake ops, save them into the corresponding ops
and delete all moving_average_abs_max_scale ops.
and delete all moving_average_abs_max_scale ops.
"""
def _gather_input_scale():
......@@ -580,6 +580,11 @@ class ImperativeQuantizeOutputs(object):
for next_op in next_ops:
next_op._rename_input(out_var_name, in_var_name)
# If next_op is `fetch` and out_var_name is in fetch_targets,
# the corresponding fetch target must be updated to in_var_name when the input is renamed.
for i in range(len(fetch_targets)):
if fetch_targets[i].name == out_var_name:
fetch_targets[i] = block.var(in_var_name)
_gather_input_scale()
_gather_output_scale()
......
......@@ -410,6 +410,23 @@ class PostTrainingQuantization(object):
for op_type in self._dynamic_quantize_op_type):
self._collect_dynamic_quantize_op_threshold(
self._dynamic_quantize_op_type)
# Move the sub-blocks' persistable vars to the global block
global_block = self._program.global_block()
for _op in global_block.ops:
if _op.type == "while":
_block_id = _op.attr("sub_block").id
_block = self._program.block(_block_id)
persistables = []
for _name, _var in _block.vars.items():
if _var.persistable:
global_block._clone_variable(_var)
persistables.append(_name)
for _name in persistables:
_block._remove_var(_name)
persistables.extend(_op.input('X'))
_op.desc.set_input("X", persistables)
return self._program
def save_quantized_model(self,
......@@ -451,10 +468,6 @@ class PostTrainingQuantization(object):
model_filename=self._model_filename,
params_filename=self._params_filename)
if self._program.num_blocks > 1:
_logger.error("The post training quantization requires that the "
"program only has one block.")
if self._optimize_model:
self._optimize_fp32_model()
......@@ -505,23 +518,26 @@ class PostTrainingQuantization(object):
self._quantized_act_var_name.add(var_name)
persistable_var_names = _all_persistable_var_names(self._program)
for op in self._program.global_block().ops:
op_type = op.type
if self._is_full_quantize and \
op_type not in self._quantizable_op_type:
_logger.warning(op_type + " is not supported for quantization.")
# For quantized ops, sample inputs and outputs
if op_type in self._quantizable_op_type:
collect_var_name(
_get_op_input_var_names(op), persistable_var_names, op_type)
collect_var_name(
_get_op_output_var_names(op), persistable_var_names,
op_type)
# For other op, only sample output scale
elif op_type in self._out_scale_op_list:
collect_var_name(
_get_op_output_var_names(op), persistable_var_names,
op_type)
for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops:
op_type = op.type
if self._is_full_quantize and \
op_type not in self._quantizable_op_type:
_logger.warning(op_type +
" is not supported for quantization.")
# For quantized ops, sample inputs and outputs
if op_type in self._quantizable_op_type:
collect_var_name(
_get_op_input_var_names(op), persistable_var_names,
op_type)
collect_var_name(
_get_op_output_var_names(op), persistable_var_names,
op_type)
# For other op, only sample output scale
elif op_type in self._out_scale_op_list:
collect_var_name(
_get_op_output_var_names(op), persistable_var_names,
op_type)
def _set_activation_persistable(self):
'''
......@@ -696,16 +712,17 @@ class PostTrainingQuantization(object):
'''
assert self._algo == "min_max", \
"The algo should be min_max to save input threshold."
for op in self._program.global_block().ops:
if op.type in self._quantizable_op_type:
for var_name in _get_op_input_var_names(op):
assert var_name in self._quantized_var_min
assert var_name in self._quantized_var_max
op._set_attr(var_name + ".min",
self._quantized_var_min[var_name])
op._set_attr(var_name + ".max",
self._quantized_var_max[var_name])
op._set_attr("with_quant_attr", True)
for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops:
if op.type in self._quantizable_op_type:
for var_name in _get_op_input_var_names(op):
assert var_name in self._quantized_var_min
assert var_name in self._quantized_var_max
op._set_attr(var_name + ".min",
self._quantized_var_min[var_name])
op._set_attr(var_name + ".max",
self._quantized_var_max[var_name])
op._set_attr("with_quant_attr", True)
def _collect_activation_abs_min_max(self):
'''
......@@ -795,7 +812,12 @@ class PostTrainingQuantization(object):
activation_quantize_type=self._activation_quantize_type,
weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
transform_pass.apply(graph)
for sub_graph in graph.all_sub_graphs():
# fake_quant/fake_dequantize ops must be inserted into a test graph,
# so set each sub-graph's _for_test to True.
sub_graph._for_test = True
transform_pass.apply(sub_graph)
# use AddQuantDequantPass to insert fake_quant_dequant op
minor_quantizable_op_types = []
......@@ -806,7 +828,10 @@ class PostTrainingQuantization(object):
scope=self._scope,
place=self._place,
quantizable_op_type=minor_quantizable_op_types)
add_quant_dequant_pass.apply(graph)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True
add_quant_dequant_pass.apply(sub_graph)
# save threshold to scale var node
if self._algo in ["KL", "hist"]:
......@@ -836,7 +861,11 @@ class PostTrainingQuantization(object):
activation_bits=self._activation_bits,
weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
freeze_pass.apply(graph)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True
freeze_pass.apply(sub_graph)
self._program = graph.to_program()
def _save_output_threshold(self):
......@@ -888,13 +917,15 @@ class PostTrainingQuantization(object):
save_info(op_node, out_var_name, self._quantized_var_max,
"out_max", "post_min_max")
for op in self._program.global_block().ops:
if op.type in (self._quantizable_op_type + self._out_scale_op_list):
out_var_names = _get_op_output_var_names(op)
assert len(out_var_names) == 1, "Post training " + \
"quantization only support one output for " + op.type
for var_name in out_var_names:
analysis_and_save_info(op, var_name)
for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops:
if op.type in (
self._quantizable_op_type + self._out_scale_op_list):
out_var_names = _get_op_output_var_names(op)
assert len(out_var_names) == 1, "Post training " + \
"quantization only support one output for " + op.type
for var_name in out_var_names:
analysis_and_save_info(op, var_name)
def _collect_dynamic_quantize_op_threshold(self, target_ops_type):
"""
......
......@@ -139,6 +139,7 @@ endfunction()
if(WIN32)
list(REMOVE_ITEM TEST_OPS test_light_nas)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model)
......@@ -336,6 +337,7 @@ if(NOT WIN32)
set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120)
set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120)
set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120)
endif()
......
# copyright (c) 2021 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import os
import time
import sys
import random
import math
import functools
import contextlib
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.dataset.common import download
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
paddle.enable_static()
random.seed(0)
np.random.seed(0)
class TestPostTrainingQuantization(unittest.TestCase):
def setUp(self):
self.download_path = 'int8/download'
self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
self.download_path)
self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
self.int8_model_path = os.path.join(os.getcwd(),
"post_training_" + self.timestamp)
try:
os.system("mkdir -p " + self.int8_model_path)
except Exception as e:
print("Failed to create {} due to {}".format(self.int8_model_path,
str(e)))
sys.exit(-1)
def tearDown(self):
try:
os.system("rm -rf {}".format(self.int8_model_path))
except Exception as e:
print("Failed to delete {} due to {}".format(self.int8_model_path,
str(e)))
def cache_unzipping(self, target_folder, zip_path):
cmd = 'tar xf {0} -C {1}'.format(zip_path, target_folder)
os.system(cmd)
def download_model(self, data_url, data_md5, folder_name):
download(data_url, self.download_path, data_md5)
file_name = data_url.split('/')[-1]
zip_path = os.path.join(self.cache_folder, file_name)
print('Data is downloaded at {0}'.format(zip_path))
data_cache_folder = os.path.join(self.cache_folder, folder_name)
self.cache_unzipping(self.cache_folder, zip_path)
return data_cache_folder
def run_program(self, model_path, batch_size, infer_iterations):
print("test model path:" + model_path)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
[infer_program, feed_dict, fetch_targets] = \
fluid.io.load_inference_model(model_path,
model_filename='model.pdmodel',
params_filename='model.pdiparams', executor=exe)
val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size)
img_shape = [1, 28, 28]
test_info = []
cnt = 0
periods = []
for batch_id, data in enumerate(val_reader()):
image = np.array(
[x[0].reshape(img_shape) for x in data]).astype("float32")
input_label = np.array([x[1] for x in data]).astype("int64")
t1 = time.time()
out = exe.run(infer_program,
feed={feed_dict[0]: image},
fetch_list=fetch_targets)
t2 = time.time()
period = t2 - t1
periods.append(period)
out_label = np.argmax(np.array(out[0]), axis=1)
top1_num = sum(input_label == out_label)
test_info.append(top1_num)
cnt += len(data)
if (batch_id + 1) == infer_iterations:
break
throughput = cnt / np.sum(periods)
latency = np.average(periods)
acc1 = np.sum(test_info) / cnt
return (throughput, latency, acc1)
def generate_quantized_model(self,
model_path,
algo="KL",
quantizable_op_type=["conv2d"],
is_full_quantize=False,
is_use_cache_file=False,
is_optimize_model=False,
batch_size=10,
batch_nums=10):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.global_scope()
val_reader = paddle.dataset.mnist.train()
ptq = PostTrainingQuantization(
executor=exe,
model_dir=model_path,
model_filename='model.pdmodel',
params_filename='model.pdiparams',
sample_generator=val_reader,
batch_size=batch_size,
batch_nums=batch_nums,
algo=algo,
quantizable_op_type=quantizable_op_type,
is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model,
is_use_cache_file=is_use_cache_file)
ptq.quantize()
ptq.save_quantized_model(
self.int8_model_path,
model_filename='model.pdmodel',
params_filename='model.pdiparams')
def run_test(self,
model_name,
data_url,
data_md5,
algo,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
batch_size=10,
infer_iterations=10,
quant_iterations=5):
origin_model_path = self.download_model(data_url, data_md5, model_name)
#origin_model_path = os.path.join(origin_model_path, model_name)
print("Start FP32 inference for {0} on {1} images ...".format(
model_name, infer_iterations * batch_size))
(fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(
origin_model_path, batch_size, infer_iterations)
print("Start INT8 post training quantization for {0} on {1} images ...".
format(model_name, quant_iterations * batch_size))
self.generate_quantized_model(
origin_model_path, algo, quantizable_op_type, is_full_quantize,
is_use_cache_file, is_optimize_model, batch_size, quant_iterations)
print("Start INT8 inference for {0} on {1} images ...".format(
model_name, infer_iterations * batch_size))
(int8_throughput, int8_latency, int8_acc1) = self.run_program(
self.int8_model_path, batch_size, infer_iterations)
print("---Post training quantization of {} method---".format(algo))
print(
"FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.".
format(model_name, batch_size, fp32_throughput, fp32_latency,
fp32_acc1))
print(
"INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n".
format(model_name, batch_size, int8_throughput, int8_latency,
int8_acc1))
sys.stdout.flush()
delta_value = fp32_acc1 - int8_acc1
self.assertLess(delta_value, diff_threshold)
class TestPostTrainingKLForWhile(TestPostTrainingQuantization):
def test_post_training_kl(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "KL"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTraininghistForWhile(TestPostTrainingQuantization):
def test_post_training_hist(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "hist"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTrainingmseForWhile(TestPostTrainingQuantization):
def test_post_training_mse(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "mse"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTrainingavgForWhile(TestPostTrainingQuantization):
def test_post_training_avg(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "avg"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTrainingMinMaxForWhile(TestPostTrainingQuantization):
def test_post_training_min_max(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "min_max"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization):
def test_post_training_abs_max(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "abs_max"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
if __name__ == '__main__':
unittest.main()
......@@ -273,6 +273,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
else:
if self._return_list:
data = self._reader.read_next_list()
for i in range(len(data)):
data[i] = data[i]._move_to_list()
data = [
_restore_batch(d, s)
for d, s in zip(data, self._structure_infos[:len(
......@@ -718,6 +720,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
else:
if self._return_list:
data = self._reader.read_next_list()
for i in range(len(data)):
data[i] = data[i]._move_to_list()
data = [
_restore_batch(d, s)
for d, s in zip(data, self._structure_infos[:len(
......
......@@ -547,7 +547,11 @@ def func_to_source_code(function, dedent=True):
raise TypeError(
"The type of 'function' should be a function or method, but received {}.".
format(type(function).__name__))
source_code = inspect.getsource(function)
source_code_list, _ = inspect.getsourcelines(function)
source_code_list = [
line for line in source_code_list if not line.lstrip().startswith('#')
]
source_code = ''.join(source_code_list)
if dedent:
source_code = textwrap.dedent(source_code)
......
......@@ -238,7 +238,7 @@ def monkey_patch_varbase():
"Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format(
grad_tensor.name, grad_tensor.shape, self.name, self.shape)
if paddle.is_compiled_with_xpu():
if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
# TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
scaled_loss = scale_loss(self)
core.dygraph_run_backward([scaled_loss], [grad_tensor],
......
......@@ -1999,6 +1999,14 @@ class Executor(object):
fetch_list=fetch_list,
feed_var_name=feed_var_name,
fetch_var_name=fetch_var_name)
main_block = cached_program.block(0)
for op in main_block.ops:
# Set the op_role of the fetch op to Optimize so that GC does not
# erase the fetched vars in pipeline mode.
if op.type == 'fetch':
op._set_attr(
'op_role',
core.op_proto_and_checker_maker.OpRole.Optimize)
self._add_program_cache(cache_key, cached_program)
if cached_ctx is None:
fleet_opt = program._pipeline_opt["fleet_opt"]
......@@ -2007,6 +2015,18 @@ class Executor(object):
self._add_ctx_cache(cache_key, cached_ctx)
if feed:
self._feed_data(cached_program, feed, feed_var_name, cached_scope)
from paddle.optimizer.lr import LRScheduler
if hasattr(program, 'lr_sheduler'):
lr_sheduler = program.lr_sheduler
assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
lr_value = lr_sheduler()
lr_var = program.global_block().vars[lr_sheduler._var_name]
data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
tensor = core.get_variable_tensor(cached_scope,
lr_sheduler._var_name)
tensor.set(data, self.place)
cached_ctx.run()
if fetch_list:
arr = cached_scope.find_var(fetch_var_name).get_fetch_list()
......
......@@ -1254,7 +1254,10 @@ class GeneratorLoader(DataLoaderBase):
def __next__(self):
try:
if self._return_list:
return self._reader.read_next_list()
data = self._reader.read_next_list()
for i in range(len(data)):
data[i] = data[i]._move_to_list()
return data
else:
return self._reader.read_next()
except StopIteration:
......
......@@ -30,6 +30,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import Shar
seed = 2021
epoch = 2
batch_size = 32
linear_size = 10000
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
......@@ -45,12 +46,12 @@ paddle.seed(seed)
class MLP(fluid.Layer):
def __init__(self, param_attr=None, bias_attr=None):
def __init__(self, linear_size=10000, param_attr=None, bias_attr=None):
super(MLP, self).__init__()
self._linear1 = Linear(10000, 10000)
self._linear2 = Linear(10000, 10000)
self._linear3 = Linear(10000, 10)
self._linear1 = Linear(linear_size, linear_size)
self._linear2 = Linear(linear_size, linear_size)
self._linear3 = Linear(linear_size, 10)
def forward(self, inputs):
y = self._linear1(inputs)
......@@ -59,10 +60,10 @@ class MLP(fluid.Layer):
return y
def reader_decorator():
def reader_decorator(linear_size=10000):
def __reader__():
for _ in range(100):
img = np.random.rand(10000).astype('float32')
img = np.random.rand(linear_size).astype('float32')
label = np.ones(1).astype('int64')
yield img, label
......@@ -120,6 +121,9 @@ def train_mlp(model,
use_multiprocess=True)
train_loader.set_sample_list_generator(train_reader)
if sharding_stage == 2:
model.to(device="gpu")
for eop in range(epoch):
model.train()
......@@ -153,9 +157,6 @@ def train_mlp(model,
if all_test and batch_id == 2:
return model.parameters()
if sharding_stage == 2:
model.to(device="gpu")
return model.parameters()
......
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import ast
import time
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.dygraph import nn
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
from dygraph_sharding_stage2 import MLP, reader_decorator, optimizer_setting
seed = 2021
epoch = 2
batch_size = 32
linear_size = 8000
np.random.seed(seed)
paddle.seed(seed)
def train_mlp(model, offload=False):
group = paddle.distributed.new_group([0, 1])
optimizer = optimizer_setting(model=model, use_pure_fp16=True)
model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
scaler = ShardingScaler(scaler, group)
optimizer = ShardingOptimizerStage2(
params=model.parameters(),
optim=optimizer,
group=group,
offload=offload)
model = ShardingStage2(model, optimizer, group=group, accumulate_grads=True)
train_reader = paddle.batch(
reader_decorator(linear_size), batch_size=batch_size, drop_last=True)
train_loader = paddle.io.DataLoader.from_generator(
capacity=32,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=True)
train_loader.set_sample_list_generator(train_reader)
for eop in range(epoch):
model.train()
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
img.stop_gradient = True
with paddle.amp.auto_cast(True, level='O2'):
out = model(img)
loss = paddle.nn.functional.cross_entropy(
input=out, label=label)
avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
scaler.scale(avg_loss).backward()
model.grad_scale()
scaler.step(optimizer)
scaler.update()
model.clear_gradients()
for dtype in optimizer.param_storages:
for dst_rank, param_storage in optimizer.param_storages[dtype].items():
param_storage.to(device="gpu", dtype=dtype)
return model.parameters()
def test_sharding_stage2_offload():
mlp = MLP(linear_size)
mlp_offload = MLP(linear_size)
mlp_offload.set_state_dict(mlp.state_dict())
mlp_params = train_mlp(mlp, offload=False)
mlp_offload_params = train_mlp(mlp_offload, offload=True)
for i in range(len(mlp_params)):
for j in range(len(mlp_offload_params)):
if mlp_params[i].name == mlp_offload_params[j].name:
np.testing.assert_allclose(
mlp_params[i].numpy(),
mlp_offload_params[j].numpy(),
rtol=1e-6)
return
if __name__ == '__main__':
test_sharding_stage2_offload()
......@@ -26,6 +26,7 @@ import paddle.fluid as fluid
from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
from ifelse_simple_func import dyfunc_with_if_else
......@@ -344,5 +345,18 @@ class TestFunctionTrainEvalMode(unittest.TestCase):
net.foo.train()
class TestRemoveCommentInDy2St(unittest.TestCase):
def func_with_comment(self):
# Comment1
x = paddle.to_tensor([1, 2, 3])
# Comment2
# Comment3
y = paddle.to_tensor([4, 5, 6])
def test_remove_comment(self):
code_string = func_to_source_code(self.func_with_comment)
self.assertEqual('#' not in code_string, True)
if __name__ == '__main__':
unittest.main()
......@@ -322,14 +322,14 @@ class PassAutoScanTest(AutoScanTest):
"Expected operator list after fusion is {}, but now it's {}".format(
op_list_after_fusion, after_op_list), )
def run_and_statis(
self,
quant=False,
max_examples=100,
reproduce=None,
min_success_num=25,
max_duration=180,
passes=None, ):
def run_and_statis(self,
quant=False,
max_examples=100,
reproduce=None,
min_success_num=25,
max_duration=180,
passes=None,
use_gpu_run_baseline=False):
if os.getenv('HYPOTHESIS_TEST_PROFILE', 'ci') == "dev":
max_examples *= 10
min_success_num *= 10
......@@ -354,7 +354,10 @@ class PassAutoScanTest(AutoScanTest):
return self.sample_program_config(draw)
def run_test(prog_config):
return self.run_test(quant=quant, prog_configs=[prog_config])
return self.run_test(
quant=quant,
prog_configs=[prog_config],
use_gpu_run_baseline=use_gpu_run_baseline)
generator = st.composite(program_generator)
loop_func = given(generator())(run_test)
......@@ -371,8 +374,8 @@ class PassAutoScanTest(AutoScanTest):
logging.info("Number of Ran Programs: {}".format(self.num_ran_programs))
logging.info("Number of Ignore Tests: {}".format(self.num_ignore_tests))
successful_ran_programs = int(self.num_ran_programs -
self.num_ignore_tests /
self.num_predictor_kinds)
self.num_ignore_tests / max(
self.num_predictor_kinds, 1))
logging.info(
"Number of successfully ran programs approximately equal to {}".
format(successful_ran_programs))
......@@ -391,7 +394,10 @@ class PassAutoScanTest(AutoScanTest):
format(max_duration))
assert False
def run_test(self, quant=False, prog_configs=None):
def run_test(self,
quant=False,
prog_configs=None,
use_gpu_run_baseline=False):
status = True
for prog_config in prog_configs:
......@@ -413,7 +419,9 @@ class PassAutoScanTest(AutoScanTest):
results: List[Dict[str, np.ndarray]] = []
# baseline: cpu no ir_optim run
base_config = self.create_inference_config(ir_optim=False)
base_config = self.create_inference_config(
ir_optim=False, use_gpu=use_gpu_run_baseline)
logging.info('RUN program_config: ' + str(prog_config))
results.append(
self.run_test_config(model, params, prog_config, base_config,
......
......@@ -109,7 +109,7 @@ class TestAdaptivePool2dConvertGlobalPass(PassAutoScanTest):
def test(self):
self.run_and_statis(
quant=False,
max_examples=100,
max_examples=300,
passes=["adaptive_pool2d_convert_global_pass"],
min_success_num=40)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
import numpy as np
import paddle.inference as paddle_infer
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest
import hypothesis
from hypothesis import given, settings, seed, example, assume, reproduce_failure
import hypothesis.strategies as st
class TestFCElementwiseLayerNormFusePass(PassAutoScanTest):
"""
x_var w(persistable) bias_var(persistable)
\ | /
fc
|
fc_out_var bias_var(persistable)
\ /
elementwise_add bias_var(persistable) scale_var(persistable)
\ | /
layer_norm
/ | \
Y mean_var variance_var
"""
def sample_predictor_configs(self, program_config):
# for gpu
config = self.create_inference_config(use_gpu=True)
yield config, ["fused_fc_elementwise_layernorm"], (1e-5, 1e-5)
def sample_program_config(self, draw):
# 1. Generate shape of input:X of fc
x_shape = draw(
st.lists(
st.integers(
min_value=1, max_value=8), min_size=2, max_size=5))
x_shape = [2, 1]
x_rank = len(x_shape)
# 2. Generate attr:in_num_col_dims of fc
in_num_col_dims = draw(st.integers(min_value=1, max_value=x_rank - 1))
# 3. Generate legal shape of input:W/bias of fc
w_shape = draw(
st.lists(
st.integers(
min_value=1, max_value=8), min_size=2, max_size=2))
w_shape[0] = int(np.prod(x_shape[in_num_col_dims:]))
w_shape = [1, 2]
fc_bias_shape = [w_shape[1], ]
if draw(st.booleans()):
fc_bias_shape.insert(0, 1)
fc_bias_shape = [2, ]
fc_out_shape = x_shape[:in_num_col_dims] + w_shape[1:]
# 4. Generate legal attr:axis/shape of elementwise_add
add_bias_shape = fc_out_shape[:]
axis = draw(st.integers(min_value=-1, max_value=0))
# 5. Generate legal shape of layer_norm
begin_norm_axis = draw(
st.integers(
min_value=1, max_value=len(fc_out_shape) - 1))
layer_norm_shape = [int(np.prod(fc_out_shape[begin_norm_axis:]))]
epsilon = 1e-5
fc_op = OpConfig(
"fc",
inputs={"Input": ["fc_x"],
"W": ["fc_w"],
"Bias": ["fc_bias"]},
outputs={"Out": ["fc_out"]},
in_num_col_dims=in_num_col_dims,
padding_weights=False,
activation_type="",
use_quantizer=False,
use_mkldnn=False, )
add_op = OpConfig(
"elementwise_add",
inputs={"X": ["fc_out"],
"Y": ["add_bias"]},
outputs={"Out": ["add_out"]},
axis=axis, )
layer_norm_op = OpConfig(
"layer_norm",
inputs={
"X": ["add_out"],
"Scale": ["scale"],
"Bias": ["layer_norm_bias"]
},
outputs={
"Y": ["layer_norm_out"],
"Mean": ["layer_norm_mean"],
"Variance": ["layer_norm_var"]
},
begin_norm_axis=begin_norm_axis,
epsilon=epsilon)
ops = [fc_op, add_op, layer_norm_op]
program_config = ProgramConfig(
ops=ops,
weights={
"fc_w": TensorConfig(shape=w_shape),
"fc_bias": TensorConfig(shape=fc_bias_shape),
"add_bias": TensorConfig(shape=add_bias_shape),
"scale": TensorConfig(shape=layer_norm_shape),
"layer_norm_bias": TensorConfig(shape=layer_norm_shape),
},
inputs={"fc_x": TensorConfig(shape=x_shape), },
outputs=ops[-1].outputs["Y"], )
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=300,
passes=["fc_elementwise_layernorm_fuse_pass"],
use_gpu_run_baseline=True)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,72 +12,147 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
import hypothesis
from hypothesis import given, settings, seed, example, assume, reproduce_failure
import hypothesis.strategies as st
class TransposeFlattenConcatFusePassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32")
data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32")
trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0])
trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0])
flatt1 = fluid.layers.flatten(trans1)
flatt2 = fluid.layers.flatten(trans2)
concat_out = fluid.layers.concat([flatt1, flatt2])
            # There are no parameters for the above structure.
            # Hence, append a batch_norm to avoid a failure caused by load_combined.
out = fluid.layers.batch_norm(concat_out, is_test=True)
self.feeds = {
"data1": np.random.random([5, 5, 5]).astype("float32"),
"data2": np.random.random([5, 5, 5]).astype("float32")
}
self.fetch_list = [out]
class TestTransposeFlattenConcatFusePass(PassAutoScanTest):
"""
x_1_var x_2_var
| |
transpose2 transpose2
| |
flatten2 flatten2
\ /
flatten2_out_var flatten2_out_var
\ /
concat
"""
def test_check_output(self):
# There is no cpu pass for transpose_flatten_concat_fuse
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu)
def sample_predictor_configs(self, program_config):
# TRT
        # After tensorrt_subgraph_pass, this pass needs to be deleted on TRT
PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass')
# for gpu
config = self.create_inference_config(use_gpu=True)
yield config, ["fusion_transpose_flatten_concat", ], (1e-5, 1e-5)
def is_program_valid(self, prog_config):
concat_axis = prog_config.ops[-1].attrs["axis"]
ops_num = len(prog_config.ops) - 1
if ops_num % 2 != 0:
return False
input_num = ops_num // 2
flatten_shape = 0
x_trans_axis = prog_config.ops[0].attrs["axis"]
x_flatten_axis = prog_config.ops[1].attrs["axis"]
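        # Every branch must share the same transpose perm and flatten axis, and
        # the dimension that is not concatenated must match across branches.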
for i in range(input_num):
input_name = "transpose2_x" + str(i)
input_shape = prog_config.inputs[input_name].shape
trans_axis = prog_config.ops[i * 2].attrs["axis"]
if x_trans_axis != trans_axis:
return False
# calculate shape after transpose
input_shape = [input_shape[j] for j in trans_axis]
            # calculate shape after flatten
flatten_axis = prog_config.ops[i * 2 + 1].attrs["axis"]
if x_flatten_axis != flatten_axis:
return False
flatten_shape1 = flatten_shape2 = 1
for j in range(len(input_shape)):
if j < flatten_axis:
flatten_shape1 *= input_shape[j]
else:
flatten_shape2 *= input_shape[j]
if concat_axis == 0:
if i == 0:
flatten_shape = flatten_shape2
elif flatten_shape != flatten_shape2:
return False
else:
if i == 0:
flatten_shape = flatten_shape1
elif flatten_shape != flatten_shape1:
return False
return True
class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32")
data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32")
trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0])
trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0])
flatt1 = fluid.layers.flatten(trans1, axis=2)
flatt2 = fluid.layers.flatten(trans2, axis=2)
concat_out = fluid.layers.concat([flatt1, flatt2], axis=1)
            # There are no parameters for the above structure.
            # Hence, append a batch_norm to avoid a failure caused by load_combined.
out = fluid.layers.batch_norm(concat_out, is_test=True)
def sample_program_config(self, draw):
times = draw(st.integers(min_value=1, max_value=6))
concat_axis = draw(st.integers(min_value=0, max_value=1))
ops = []
concat_input = []
inputs = {}
x_shape_rank = draw(st.integers(min_value=2, max_value=5))
# Generate axis of transpose
trans_axis = [j for j in range(x_shape_rank)]
for j in range(x_shape_rank - 1):
if draw(st.booleans()):
trans_axis[j], trans_axis[-1] = trans_axis[-1], trans_axis[j]
# Generate axis of flatten
flatten_axis = draw(
st.integers(
min_value=0, max_value=x_shape_rank - 1))
for i in range(times):
# Generate x_shape of transpose
x_shape = draw(
st.lists(
st.integers(
min_value=1, max_value=10),
min_size=x_shape_rank,
max_size=x_shape_rank))
self.feeds = {
"data1": np.random.random([5, 5, 5]).astype("float32"),
"data2": np.random.random([5, 5, 5]).astype("float32")
}
self.fetch_list = [out]
str_i = str(i)
transpose_op = OpConfig(
"transpose2",
inputs={"X": ["transpose2_x" + str_i], },
axis=trans_axis,
outputs={
"Out": ["trans_out" + str_i],
"XShape": ["trans_shape" + str_i]
}, )
ops.append(transpose_op)
flatten_op = OpConfig(
"flatten2",
inputs={"X": ["trans_out" + str_i], },
axis=flatten_axis,
outputs={
"Out": ["flatten2_out" + str_i],
"XShape": ["xshape" + str_i]
}, )
concat_input.append("flatten2_out" + str_i)
ops.append(flatten_op)
inputs["transpose2_x" + str_i] = TensorConfig(shape=x_shape)
def test_check_output(self):
# There is no cpu pass for transpose_flatten_concat_fuse
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu)
concat_op = OpConfig(
"concat",
inputs={
"X": concat_input,
"AxisTensor": [],
},
outputs={"Out": ["concat_out"]},
axis=concat_axis, )
self.assertTrue(
PassVersionChecker.IsCompatible(
'transpose_flatten_concat_fuse_pass'))
ops.append(concat_op)
program_config = ProgramConfig(
ops=ops,
weights={},
inputs=inputs,
outputs=ops[-1].outputs["Out"], )
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=300,
passes=["transpose_flatten_concat_fuse_pass"])
if __name__ == "__main__":
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import copy
import numpy as np
import paddle
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.fluid.core as core
from paddle.fluid import layers
from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImplContainer
from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImpl
from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container
from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext
from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
from paddle.distributed.auto_parallel.dist_op import DistributedOperator
paddle.enable_static()
device = "gpu" if core.is_compiled_with_cuda() else "cpu"
class MLPLayer(nn.Layer):
def __init__(self,
hidden_size=1024,
intermediate_size=4 * 1024,
initializer_range=0.02):
super(MLPLayer, self).__init__()
d_model = hidden_size
dim_feedforward = intermediate_size
weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
mean=0.0, std=initializer_range))
bias_attr = None
self.linear0 = nn.Linear(
d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
self.linear1 = nn.Linear(
dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
def forward(self, input):
out = self.norm(input)
out = self.linear0(out)
out = F.gelu(out, approximate=True)
out = self.linear1(out)
return out
def mlp_forward(train_program, start_program):
with static.program_guard(train_program,
start_program), utils.unique_name.guard():
batch_size = 4
hidden_size = 1024
sqrt_hidden_size = 32
double_hidden_size = 64
input = static.data(name="input", shape=[8, 8, 16], dtype='int32')
input = paddle.reshape(input, [hidden_size])
input = paddle.reshape(input, [sqrt_hidden_size, sqrt_hidden_size])
embedding = paddle.nn.Embedding(2, batch_size, sparse=True)
input = embedding(input)
input = paddle.reshape(input, [hidden_size, batch_size])
input = paddle.transpose(input, perm=[1, 0])
matmulinput = static.data(
name="matmulinput",
shape=[hidden_size, hidden_size],
dtype='float32')
input = layers.matmul(x=input, y=matmulinput)
label = static.data(
name="label", shape=[batch_size, 1], dtype='float32')
mlp = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
initializer_range=0.02)
predict = mlp(input)
error_cost = paddle.nn.functional.square_error_cost(predict, label)
loss = paddle.mean(error_cost)
m = paddle.nn.Softmax()
loss = m(loss)
return loss, train_program, start_program
class Testcompatible(unittest.TestCase):
def test_matmulv2_matmul_2_compatible(self):
valid_op_dist_attr_list = []
program = paddle.static.Program()
startup_program = paddle.static.Program()
loss, program, start_program = mlp_forward(program, startup_program)
with static.program_guard(program,
start_program), utils.unique_name.guard():
matmulx3 = static.data(
name="matmulx3", shape=[6, 2, 6], dtype='float32')
matmuly3 = static.data(
name="matmuly3", shape=[6, 6], dtype='float32')
output1 = paddle.matmul(x=matmulx3, y=matmuly3)
output_1 = layers.matmul(x=matmulx3, y=matmuly3)
matmulx4 = static.data(
name="matmulx4", shape=[6, 6, 2, 6], dtype='float32')
matmuly4 = static.data(
name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
output2 = paddle.matmul(x=matmulx4, y=matmuly4)
output_2 = layers.matmul(x=matmulx4, y=matmuly4)
ops = program.global_block().ops
vars = program.global_block().vars
for idx, op in enumerate(ops):
if op.type == 'matmul_v2' or op.type == 'matmul':
dist_op_impl_container = get_distributed_operator_impl_container(
op.type)
impls = dist_op_impl_container.get_impls()
op_dist_attr = OperatorDistributedAttribute()
X = op.input_arg_names[0]
Y = op.input_arg_names[1]
out = op.output_arg_names[0]
if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1])
self.assertTrue(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
self.assertTrue(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [1, -1, -1])
op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1])
self.assertTrue(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
def test_matmulv2_matmul_1_compatible(self):
valid_op_dist_attr_list = []
program = paddle.static.Program()
startup_program = paddle.static.Program()
loss, program, start_program = mlp_forward(program, startup_program)
with static.program_guard(program,
start_program), utils.unique_name.guard():
matmulx3 = static.data(
name="matmulx3", shape=[6, 2, 6], dtype='float32')
matmuly3 = static.data(
name="matmuly3", shape=[6, 6], dtype='float32')
output1 = paddle.matmul(x=matmulx3, y=matmuly3)
output_1 = layers.matmul(x=matmulx3, y=matmuly3)
matmulx4 = static.data(
name="matmulx4", shape=[6, 6, 6, 6], dtype='float32')
matmuly4 = static.data(
name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
output2 = paddle.matmul(x=matmulx4, y=matmuly4)
output_2 = layers.matmul(x=matmulx4, y=matmuly4)
ops = program.global_block().ops
vars = program.global_block().vars
for idx, op in enumerate(ops):
if op.type == 'matmul_v2' or op.type == 'matmul':
dist_op_impl_container = get_distributed_operator_impl_container(
op.type)
impls = dist_op_impl_container.get_impls()
op_dist_attr = OperatorDistributedAttribute()
X = op.input_arg_names[0]
Y = op.input_arg_names[1]
out = op.output_arg_names[0]
if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, 1])
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1])
dist_op = DistributedOperator(op, op_dist_attr)
op_dist_attr.set_output_dims_mapping(out, [1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1])
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
self.assertTrue(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [1, -1, 1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(out, [-1, -1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 0, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, 1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1])
self.assertTrue(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
def test_matmulv2_matmul_0_compatible(self):
valid_op_dist_attr_list = []
program = paddle.static.Program()
startup_program = paddle.static.Program()
loss, program, start_program = mlp_forward(program, startup_program)
with static.program_guard(program,
start_program), utils.unique_name.guard():
matmulx3 = static.data(
name="matmulx3", shape=[6, 2, 6], dtype='float32')
matmuly3 = static.data(
name="matmuly3", shape=[6, 6], dtype='float32')
output1 = paddle.matmul(x=matmulx3, y=matmuly3)
output_1 = layers.matmul(x=matmulx3, y=matmuly3)
matmulx4 = static.data(
name="matmulx4", shape=[6, 6, 2, 6], dtype='float32')
matmuly4 = static.data(
name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
output2 = paddle.matmul(x=matmulx4, y=matmuly4)
output_2 = layers.matmul(x=matmulx4, y=matmuly4)
ops = program.global_block().ops
vars = program.global_block().vars
for idx, op in enumerate(ops):
if op.type == 'matmul_v2' or op.type == 'matmul':
dist_op_impl_container = get_distributed_operator_impl_container(
op.type)
impls = dist_op_impl_container.get_impls()
op_dist_attr = OperatorDistributedAttribute()
X = op.input_arg_names[0]
Y = op.input_arg_names[1]
out = op.output_arg_names[0]
if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
op_dist_attr.set_output_dims_mapping(out, [-1, 1])
self.assertTrue(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [0, 0])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [0, -1])
op_dist_attr.set_output_dims_mapping(out, [1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1])
self.assertTrue(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 0, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [1, -1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, 1])
self.assertTrue(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 1, 1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
from paddle.fluid import dygraph
from paddle import static
paddle.enable_static()
def ref_view_as_complex(x):
real, imag = np.take(x, 0, axis=-1), np.take(x, 1, axis=-1)
return real + 1j * imag
def ref_view_as_real(x):
return np.stack([x.real, x.imag], -1)
class TestViewAsComplexOp(OpTest):
def setUp(self):
self.op_type = "as_complex"
x = np.random.randn(10, 10, 2).astype("float64")
out_ref = ref_view_as_complex(x)
self.out_grad = np.ones(
[10, 10], dtype="float64") + 1j * np.ones(
[10, 10], dtype="float64")
self.inputs = {'X': x}
self.outputs = {'Out': out_ref}
def test_check_output(self):
self.check_output()
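    # The expected gradient of as_complex is the upstream complex gradient viewed
    # as a real tensor (real and imaginary parts stacked on the last axis).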
def test_check_grad(self):
self.check_grad(
['X'],
'Out',
user_defined_grads=[ref_view_as_real(self.out_grad)],
user_defined_grad_outputs=[self.out_grad])
class TestViewAsRealOp(OpTest):
def setUp(self):
self.op_type = "as_real"
real = np.random.randn(10, 10).astype("float64")
imag = np.random.randn(10, 10).astype("float64")
x = real + 1j * imag
out_ref = ref_view_as_real(x)
self.inputs = {'X': x}
self.outputs = {'Out': out_ref}
self.out_grad = np.ones([10, 10, 2], dtype="float64")
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(
['X'],
'Out',
user_defined_grads=[ref_view_as_complex(self.out_grad)],
user_defined_grad_outputs=[self.out_grad])
class TestViewAsComplexAPI(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(10, 10, 2)
self.out = ref_view_as_complex(self.x)
def test_dygraph(self):
with dygraph.guard():
x = paddle.to_tensor(self.x)
out_np = paddle.as_complex(x).numpy()
self.assertTrue(np.allclose(self.out, out_np))
def test_static(self):
mp, sp = static.Program(), static.Program()
with static.program_guard(mp, sp):
x = static.data("x", shape=[10, 10, 2], dtype="float64")
out = paddle.as_complex(x)
exe = static.Executor()
exe.run(sp)
[out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out])
self.assertTrue(np.allclose(self.out, out_np))
class TestViewAsRealAPI(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(10, 10) + 1j * np.random.randn(10, 10)
self.out = ref_view_as_real(self.x)
def test_dygraph(self):
with dygraph.guard():
x = paddle.to_tensor(self.x)
out_np = paddle.as_real(x).numpy()
self.assertTrue(np.allclose(self.out, out_np))
def test_static(self):
mp, sp = static.Program(), static.Program()
with static.program_guard(mp, sp):
x = static.data("x", shape=[10, 10], dtype="complex128")
out = paddle.as_real(x)
exe = static.Executor()
exe.run(sp)
[out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out])
self.assertTrue(np.allclose(self.out, out_np))
if __name__ == "__main__":
unittest.main()
......@@ -34,7 +34,8 @@ class TestCUDAGraph(unittest.TestCase):
paddle.set_flags({
'FLAGS_allocator_strategy': 'auto_growth',
'FLAGS_sync_nccl_allreduce': False,
'FLAGS_cudnn_deterministic': True
'FLAGS_cudnn_deterministic': True,
'FLAGS_use_stream_safe_cuda_allocator': False,
})
def random_tensor(self, shape):
......@@ -187,6 +188,48 @@ class TestCUDAGraph(unittest.TestCase):
finally:
graph.reset()
def test_dataloader(self):
if not can_use_cuda_graph():
return
class AutoIncDataset(paddle.io.Dataset):
def __init__(self, n, dtype):
self.n = n
self.dtype = dtype
def __len__(self):
return self.n
def __getitem__(self, idx):
return np.array([idx]).astype(self.dtype)
n = 100
dtype = 'int64'
dataset = AutoIncDataset(n, dtype)
data_loader = paddle.io.DataLoader(
dataset, batch_size=1, num_workers=2, use_buffer_reader=True)
x = None
y = None
graph = None
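        # Capture y = x * x on the first batch only; later batches are fed by
        # copying their data into the captured input tensor and replaying the graph.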
for i, data in enumerate(data_loader):
if graph is None:
x = data
x = x.cuda()
graph = CUDAGraph()
graph.capture_begin()
y = x * x
graph.capture_end()
else:
x.copy_(data, False)
x = x.cuda()
graph.replay()
actual_x = np.array([[i]]).astype(dtype)
actual_y = np.array([[i * i]]).astype(dtype)
self.assertTrue(np.array_equal(actual_x, x.numpy()))
self.assertTrue(np.array_equal(actual_y, y.numpy()))
if __name__ == "__main__":
unittest.main()
......@@ -336,6 +336,29 @@ class UniformTest11(UniformTest):
name='values', shape=[dims], dtype='float32')
class UniformTestSample(unittest.TestCase):
def setUp(self):
self.init_param()
def init_param(self):
self.low = 3.0
self.high = 4.0
def test_uniform_sample(self):
paddle.disable_static()
uniform = Uniform(low=self.low, high=self.high)
s = uniform.sample([100])
self.assertTrue((s >= self.low).all())
self.assertTrue((s < self.high).all())
paddle.enable_static()
class UniformTestSample2(UniformTestSample):
def init_param(self):
self.low = -5.0
self.high = 2.0
class NormalNumpy(DistributionNumpy):
def __init__(self, loc, scale):
self.loc = np.array(loc)
......
......@@ -26,6 +26,9 @@ class TestDygraphShardingStage2(TestMultipleGpus):
def test_dygraph_sharding_optimizer_stage2(self):
self.run_mnist_2gpu('dygraph_sharding_stage2.py')
def test_dygraph_sharding_optimizer_stage2_offload(self):
self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py')
if __name__ == "__main__":
unittest.main()
......@@ -47,6 +47,18 @@ class TestFleetExecutor(unittest.TestCase):
name='y', shape=y_data.shape, dtype=y_data.dtype)
z = x + y
a = 2 * x + 3 * y
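            # Attach a piecewise-decay learning rate schedule and an AdamW
            # optimizer with global-norm gradient clipping to the loss.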
loss = paddle.mean(a)
base_lr = 0.1
passes = [30, 60, 80, 90]
steps_per_pass = 10
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr)
opt = paddle.optimizer.AdamW(
learning_rate=lr_val,
grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
opt.minimize(loss)
# TODO: section_program will be removed in the future
empty_program._pipeline_opt = {
"fleet_opt": self.fake_fleet_opt(),
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from op_test import OpTest
paddle.enable_static()
class TestGcdAPI(unittest.TestCase):
def setUp(self):
self.x_np = 12
self.y_np = 20
self.x_shape = [1]
self.y_shape = [1]
def test_static_graph(self):
startup_program = fluid.Program()
train_program = fluid.Program()
with fluid.program_guard(startup_program, train_program):
x = fluid.data(name='input1', dtype='int32', shape=self.x_shape)
y = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
out = paddle.gcd(x, y)
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
res = exe.run(fluid.default_main_program(),
feed={'input1': self.x_np,
'input2': self.y_np},
fetch_list=[out])
self.assertTrue((np.array(res[0]) == np.gcd(self.x_np, self.y_np)
).all())
def test_dygraph(self):
paddle.disable_static()
x = paddle.to_tensor(self.x_np)
y = paddle.to_tensor(self.y_np)
result = paddle.gcd(x, y)
self.assertEqual(
np.allclose(np.gcd(self.x_np, self.y_np), result.numpy()), True)
paddle.enable_static()
class TestGcdAPI2(TestGcdAPI):
def setUp(self):
self.x_np = np.arange(6).astype(np.int32)
self.y_np = np.array([20]).astype(np.int32)
self.x_shape = [6]
self.y_shape = [1]
class TestGcdAPI3(TestGcdAPI):
def setUp(self):
self.x_np = 0
self.y_np = 20
self.x_shape = [1]
self.y_shape = [1]
class TestGcdAPI4(TestGcdAPI):
def setUp(self):
self.x_np = 0
self.y_np = 0
self.x_shape = [1]
self.y_shape = [1]
class TestGcdAPI5(TestGcdAPI):
def setUp(self):
self.x_np = 12
self.y_np = -20
self.x_shape = [1]
self.y_shape = [1]
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from op_test import OpTest
paddle.enable_static()
class TestLcmAPI(unittest.TestCase):
def setUp(self):
self.x_np = 12
self.y_np = 20
self.x_shape = [1]
self.y_shape = [1]
def test_static_graph(self):
startup_program = fluid.Program()
train_program = fluid.Program()
with fluid.program_guard(startup_program, train_program):
x1 = fluid.data(name='input1', dtype='int32', shape=self.x_shape)
x2 = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
out = paddle.lcm(x1, x2)
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
res = exe.run(fluid.default_main_program(),
feed={'input1': self.x_np,
'input2': self.y_np},
fetch_list=[out])
self.assertTrue((np.array(res[0]) == np.lcm(self.x_np, self.y_np)
).all())
def test_dygraph(self):
paddle.disable_static()
x1 = paddle.to_tensor(self.x_np)
x2 = paddle.to_tensor(self.y_np)
result = paddle.lcm(x1, x2)
self.assertEqual(
np.allclose(np.lcm(self.x_np, self.y_np), result.numpy()), True)
paddle.enable_static()
class TestLcmAPI2(TestLcmAPI):
def setUp(self):
self.x_np = np.arange(6).astype(np.int32)
self.y_np = np.array([20]).astype(np.int32)
self.x_shape = [6]
self.y_shape = [1]
class TestLcmAPI3(TestLcmAPI):
def setUp(self):
self.x_np = 0
self.y_np = 20
self.x_shape = [1]
self.y_shape = [1]
class TestLcmAPI4(TestLcmAPI):
def setUp(self):
self.x_np = 0
self.y_np = 0
self.x_shape = [1]
self.y_shape = [1]
class TestLcmAPI5(TestLcmAPI):
def setUp(self):
self.x_np = 12
self.y_np = -20
self.x_shape = [1]
self.y_shape = [1]
......@@ -23,6 +23,7 @@ from .framework import set_grad_enabled # noqa: F401
from ..fluid.param_attr import ParamAttr # noqa: F401
from ..fluid.layers.tensor import create_parameter # noqa: F401
from ..fluid.core import CPUPlace # noqa: F401
from ..fluid.core import IPUPlace # noqa: F401
from ..fluid.core import CUDAPlace # noqa: F401
from ..fluid.core import CUDAPinnedPlace # noqa: F401
from ..fluid.core import NPUPlace # noqa: F401
......
......@@ -111,6 +111,9 @@ from .manipulation import unbind # noqa: F401
from .manipulation import roll # noqa: F401
from .manipulation import chunk # noqa: F401
from .manipulation import tensordot # noqa: F401
from .manipulation import as_complex # noqa: F401
from .manipulation import as_real # noqa: F401
from .math import abs # noqa: F401
from .math import acos # noqa: F401
from .math import asin # noqa: F401
......@@ -194,6 +197,8 @@ from .math import lerp # noqa: F401
from .math import lerp_ # noqa: F401
from .math import rad2deg # noqa: F401
from .math import deg2rad # noqa: F401
from .math import gcd # noqa: F401
from .math import lcm # noqa: F401
from .math import diff # noqa: F401
from .math import angle # noqa: F401
......@@ -409,6 +414,12 @@ tensor_method_func = [ #noqa
'multi_dot',
'solve',
'triangular_solve',
'as_complex',
'as_real',
'rad2deg',
'deg2rad',
'gcd',
'lcm',
'diff',
'lerp',
'lerp_',
......
......@@ -34,6 +34,7 @@ from ..fluid import layers
from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only
import paddle
from paddle import _C_ops
from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype
__all__ = []
......@@ -2488,3 +2489,94 @@ def tensordot(x, y, axes=2, name=None):
[contraction_size, not_contraction_size_y])
out = x.matmul(y).reshape(shape_out)
return out
def as_complex(x, name=None):
"""Transform a real tensor to a complex tensor.
The data type of the input tensor is 'float32' or 'float64', and the data
type of the returned tensor is 'complex64' or 'complex128', respectively.
    The shape of the input tensor is ``(*, 2)`` (``*`` means arbitrary shape), i.e.
    the size of the last axis should be 2, which represents the real and imaginary
    parts of a complex number. The shape of the returned tensor is ``(*,)``.
Args:
x (Tensor): The input tensor. Data type is 'float32' or 'float64'.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The output. Data type is 'complex64' or 'complex128', with the same precision as the input.
Examples:
.. code-block:: python
import paddle
x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2])
y = paddle.as_complex(x)
print(y.numpy())
# [[ 0. +1.j 2. +3.j 4. +5.j]
# [ 6. +7.j 8. +9.j 10.+11.j]]
"""
if in_dygraph_mode():
return paddle._C_ops.as_complex(x)
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'as_complex')
op_type = "as_complex"
helper = LayerHelper(op_type, **locals())
inputs = {"X": x}
out = helper.create_variable_for_type_inference(
dtype=_real_to_complex_dtype(x.dtype))
outputs = {"Out": out}
attrs = {}
helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs)
return out
def as_real(x, name=None):
"""Transform a complex tensor to a real tensor.
The data type of the input tensor is 'complex64' or 'complex128', and the data
type of the returned tensor is 'float32' or 'float64', respectively.
    When the shape of the input tensor is ``(*, )`` (``*`` means arbitrary shape),
    the shape of the output tensor is ``(*, 2)``, i.e. the shape of the output is
    the shape of the input with an extra ``2`` appended.
Args:
x (Tensor): The input tensor. Data type is 'complex64' or 'complex128'.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The output. Data type is 'float32' or 'float64', with the same precision as the input.
Examples:
.. code-block:: python
import paddle
x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2])
y = paddle.as_complex(x)
z = paddle.as_real(y)
print(z.numpy())
# [[[ 0. 1.]
# [ 2. 3.]
# [ 4. 5.]]
# [[ 6. 7.]
# [ 8. 9.]
# [10. 11.]]]
"""
if in_dygraph_mode():
return paddle._C_ops.as_real(x)
check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'as_real')
op_type = "as_real"
helper = LayerHelper(op_type, **locals())
inputs = {"X": x}
out = helper.create_variable_for_type_inference(
dtype=_complex_to_real_dtype(x.dtype))
outputs = {"Out": out}
helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
return out
......@@ -2624,9 +2624,9 @@ def lerp(x, y, weight, name=None):
lerp(x, y, weight) = x + weight * (y - x).
Args:
x (Tensor): An N-D Tensor, the data type is float32, float64.
y (Tensor): An N-D Tensor, the data type is float32, float64.
weight (float|Tensor): the weight for the interpolation formula.
x (Tensor): An N-D Tensor with starting points, the data type is float32, float64.
y (Tensor): An N-D Tensor with ending points, the data type is float32, float64.
weight (float|Tensor): The weight for the interpolation formula. When weight is Tensor, the data type is float32, float64.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
......@@ -2788,6 +2788,139 @@ def deg2rad(x, name=None):
type='scale', inputs={'X':out_cast}, outputs={'Out': out}, attrs={'scale': deg2rad_scale})
return out
def gcd(x, y, name=None):
"""
Computes the element-wise greatest common divisor (GCD) of input |x| and |y|.
Both x and y must have integer types.
Note:
gcd(0,0)=0, gcd(0, y)=|y|
Args:
        x, y (Tensor): An N-D Tensor, the data type is int8, int16, int32, int64 or uint8.
If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
out (Tensor): An N-D Tensor, the data type is the same with input.
Examples:
.. code-block:: python
import paddle
import numpy as np
x1 = paddle.to_tensor(12)
x2 = paddle.to_tensor(20)
paddle.gcd(x1, x2)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [4])
x3 = paddle.to_tensor(np.arange(6))
paddle.gcd(x3, x2)
# Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [20, 1 , 2 , 1 , 4 , 5])
x4 = paddle.to_tensor(0)
paddle.gcd(x4, x2)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [20])
paddle.gcd(x4, x4)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [0])
x5 = paddle.to_tensor(-20)
paddle.gcd(x1, x5)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [4])
"""
shape = paddle.broadcast_shape(x.shape, y.shape)
x = paddle.broadcast_to(x, shape)
y = paddle.broadcast_to(y, shape)
x = paddle.abs(x)
y = paddle.abs(y)
def _gcd_cond_fn(x, y):
return paddle.any(y != 0)
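    # _gcd_body_fn performs one element-wise Euclidean step, e.g. for
    # (x, y) = (12, 20): (12, 20) -> (20, 12) -> (12, 8) -> (8, 4) -> (4, 0), giving 4.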
def _gcd_body_fn(x, y):
# paddle.mod will raise an error when any element of y is 0. To avoid
# that, we change those zeros to ones. Their values don't matter because
# they won't be used.
y_not_equal_0 = (y != 0)
y_safe = paddle.where(y_not_equal_0, y, paddle.ones(y.shape, y.dtype))
x, y = (paddle.where(y_not_equal_0, y, x),
                paddle.where(y_not_equal_0, paddle.mod(x, y_safe), paddle.zeros(y.shape, y.dtype)))
return (paddle.where(x < y, y, x), paddle.where(x < y, x, y))
if in_dygraph_mode():
while _gcd_cond_fn(x, y):
x, y = _gcd_body_fn(x, y)
return x
else:
check_variable_and_dtype(x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
check_variable_and_dtype(y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn, [x, y])
return out
def lcm(x, y, name=None):
"""
Computes the element-wise least common multiple (LCM) of input |x| and |y|.
Both x and y must have integer types.
Note:
lcm(0,0)=0, lcm(0, y)=0
Args:
        x, y (Tensor): An N-D Tensor, the data type is int8, int16, int32, int64 or uint8.
If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
out (Tensor): An N-D Tensor, the data type is the same with input.
Examples:
.. code-block:: python
import paddle
import numpy as np
x1 = paddle.to_tensor(12)
x2 = paddle.to_tensor(20)
paddle.lcm(x1, x2)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [60])
x3 = paddle.to_tensor(np.arange(6))
paddle.lcm(x3, x2)
# Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [0, 20, 20, 60, 20, 20])
x4 = paddle.to_tensor(0)
paddle.lcm(x4, x2)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [0])
paddle.lcm(x4, x4)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [0])
x5 = paddle.to_tensor(-20)
paddle.lcm(x1, x5)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [60])
"""
d = paddle.gcd(x, y)
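    # lcm(x, y) = |x * y| / gcd(x, y), with lcm defined as 0 wherever gcd is 0.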
    # To avoid dividing by zero, replace zeros in d with ones. Their values
    # don't matter because the corresponding outputs are set to 0 anyway.
d_equal_0 = paddle.equal(d, 0)
d_safe = paddle.where(d_equal_0, paddle.ones(d.shape, d.dtype), d)
out = paddle.where(d_equal_0, paddle.zeros(d.shape, d.dtype), paddle.abs(x * y) // d_safe)
return out
def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
r"""
Computes the n-th forward difference along the given axis.
......@@ -2949,7 +3082,6 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
return out
def angle(x, name=None):
r"""
Element-wise angle of complex numbers. For non-negative real numbers, the angle is 0 while
......@@ -2965,7 +3097,7 @@ def angle(x, name=None):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
out (Tensor): y (Tensor): An N-D Tensor of real data type with the same precision as that of x's data type.
Tensor: An N-D Tensor of real data type with the same precision as that of x's data type.
Examples:
.. code-block:: python
......
......@@ -76,7 +76,7 @@
infer_meta :
func : MatmulInferMeta
kernel :
func : matmul_v2
func : matmul
- api : mean
args : (const Tensor& x, const std::vector<int64_t>& axis, bool keep_dim)
......
......@@ -345,6 +345,7 @@ def source_include(header_file_path):
#include "glog/logging.h"
#include "paddle/pten/api/lib/api_registry.h"
#include "paddle/pten/api/lib/kernel_declare.h"
#include "paddle/pten/api/lib/kernel_dispatch.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/core/kernel_registry.h"
......@@ -353,22 +354,6 @@ def source_include(header_file_path):
"""
def module_declare():
return """
PT_DECLARE_MODULE(CreationCPU);
PT_DECLARE_MODULE(LinalgCPU);
PT_DECLARE_MODULE(ManipulationCPU);
PT_DECLARE_MODULE(MathCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(CreationCUDA);
PT_DECLARE_MODULE(LinalgCUDA);
PT_DECLARE_MODULE(ManipulationCUDA);
PT_DECLARE_MODULE(MathCUDA);
#endif
"""
def api_register():
return """
PT_REGISTER_API(Creation);
......@@ -405,7 +390,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
include_header_file = "paddle/pten/api/include/api.h"
source_file.write(source_include(include_header_file))
source_file.write(module_declare())
source_file.write(namespace[0])
for api in apis:
......
......@@ -202,7 +202,7 @@ HIGH_PARALLEL_JOB_NEW = [
'test_fleet_runtime',
'test_rnn_cudnn_params_packing',
'test_mkldnn_placement_pass',
'test_fc_elementwise_layernorm_fuse_pass',
'test_fc_elementwise_layernorm_fuse_pass_cc',
'program_desc_test',
'test_simplify_with_basic_ops_pass',
'test_dygraph_mode_of_unittest',
......@@ -1417,7 +1417,7 @@ CPU_PARALLEL_JOB = [
'test_fc_mkldnn_op',
'test_fc_lstm_fuse_pass',
'test_fc_gru_fuse_pass',
'test_fc_elementwise_layernorm_fuse_pass',
'test_fc_elementwise_layernorm_fuse_pass_cc',
'test_fc_bf16_mkldnn_op',
'test_executor_feed_non_tensor',
'test_executor_check_feed',
......