Unverified commit a7707efb, authored by jjyaoao, committed by GitHub

delete paddle/fluid/operators/*_npu.* (#52678)

* delete paddle/fluid/operators/*_npu.*

* try pass CI

* try pass CI
Parent 2b0fffc2
@@ -96,7 +96,7 @@ register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 genera
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc run_program_op_npu.cc DEPS executor_cache ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
target_link_libraries(run_program_op cuda_graph_with_memory_pool)
op_library(quantize_linear_op DEPS phi)
op_library(save_combine_op DEPS string_array phi)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AbsNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("Abs",
{
*x,
},
{*out},
{});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class AbsGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("AbsGrad", {*x, *dout}, {*dx}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
abs,
ops::AbsNPUKernel<plat::NPUDeviceContext, float>,
ops::AbsNPUKernel<plat::NPUDeviceContext, plat::float16>);
REGISTER_OP_NPU_KERNEL(
abs_grad,
ops::AbsGradNPUKernel<plat::NPUDeviceContext, float>,
ops::AbsGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
This diff is collapsed.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
static void TranposeNPU(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
std::vector<int64_t>* perm,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
out->mutable_data<T>(ctx.GetPlace());
NpuOpRunner runner;
runner.SetType("Transpose")
.AddInput(in)
.AddInput(std::move(*perm))
.AddOutput(*out)
.Run(stream);
}
static void CastToInt64(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
out->mutable_data<int64_t>(ctx.GetPlace());
NpuOpRunner runner;
runner.SetType("Cast")
.AddInput(in)
.AddOutput(*out)
.AddAttr("dst_type", ACL_INT64)
.Run(stream);
}
static void CastToFP32(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
out->mutable_data<float>(ctx.GetPlace());
NpuOpRunner runner;
runner.SetType("Cast")
.AddInput(in)
.AddOutput(*out)
.AddAttr("dst_type", ACL_FLOAT)
.Run(stream);
}
template <typename T>
class ArgsortNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
int axis = ctx.Attr<int>("axis");
bool descending = ctx.Attr<bool>("descending");
auto in_dims = input->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
framework::NPUAttributeMap attr = {{"axis", -1},
{"descending", descending}};
phi::DenseTensor indices_tmp(phi::DataType::INT32);
indices_tmp.Resize(indices->dims());
if (framework::TransToProtoVarType(input->dtype()) ==
framework::proto::VarType::INT64) {
phi::DenseTensor input_fp32(phi::DataType::FLOAT32);
input_fp32.Resize(input->dims());
CastToFP32(ctx, stream, *input, &input_fp32);
phi::DenseTensor output_fp32(phi::DataType::FLOAT32);
output_fp32.Resize(output->dims());
if (axis == -1 || axis + 1 == in_dims.size()) {
output_fp32.mutable_data<float>(ctx.GetPlace());
indices_tmp.mutable_data<int32_t>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Sort", {input_fp32}, {output_fp32, indices_tmp}, attr);
runner.Run(stream);
CastToInt64(ctx, stream, output_fp32, output);
} else {
std::vector<int64_t> perm;
for (int64_t i = 0; i < in_dims.size(); i++) {
perm.emplace_back(i);
}
std::swap(perm[axis], perm[in_dims.size() - 1]);
std::vector<int64_t> shape;
for (size_t i = 0; i < perm.size(); i++) {
shape.emplace_back(in_dims[perm[i]]);
}
auto trans_dims = phi::make_ddim(shape);
phi::DenseTensor trans_input(input_fp32.type());
trans_input.Resize(trans_dims);
TranposeNPU<float>(ctx, stream, &perm, input_fp32, &trans_input);
phi::DenseTensor trans_output(input_fp32.type());
phi::DenseTensor trans_indices(phi::DataType::INT32);
trans_output.mutable_data<float>(trans_dims, ctx.GetPlace());
trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner(
"Sort", {trans_input}, {trans_output, trans_indices}, attr);
runner.Run(stream);
TranposeNPU<float>(ctx, stream, &perm, trans_output, &output_fp32);
TranposeNPU<int32_t>(ctx, stream, &perm, trans_indices, &indices_tmp);
CastToInt64(ctx, stream, output_fp32, output);
}
} else {
if (axis == -1 || axis + 1 == in_dims.size()) {
output->mutable_data<T>(ctx.GetPlace());
indices_tmp.mutable_data<int32_t>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr);
runner.Run(stream);
} else {
std::vector<int64_t> perm;
for (int64_t i = 0; i < in_dims.size(); i++) {
perm.emplace_back(i);
}
std::swap(perm[axis], perm[in_dims.size() - 1]);
std::vector<int64_t> shape;
for (size_t i = 0; i < perm.size(); i++) {
shape.emplace_back(in_dims[perm[i]]);
}
auto trans_dims = phi::make_ddim(shape);
phi::DenseTensor trans_input(input->type());
trans_input.Resize(trans_dims);
TranposeNPU<T>(ctx, stream, &perm, *input, &trans_input);
phi::DenseTensor trans_output(input->type());
phi::DenseTensor trans_indices(phi::DataType::INT32);
trans_output.mutable_data<T>(trans_dims, ctx.GetPlace());
trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner(
"Sort", {trans_input}, {trans_output, trans_indices}, attr);
runner.Run(stream);
TranposeNPU<T>(ctx, stream, &perm, trans_output, output);
TranposeNPU<int32_t>(ctx, stream, &perm, trans_indices, &indices_tmp);
}
}
CastToInt64(ctx, stream, indices_tmp, indices);
}
};
template <typename T, typename Type>
static void FullAssignNPU(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
const framework::DDim in_dims,
const phi::DenseTensor& input,
const phi::DenseTensor& indices,
phi::DenseTensor* t_out) {
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
phi::DenseTensor input_tmp;
input_tmp.ShareDataWith(input);
input_tmp.Resize(
phi::make_ddim(std::vector<int64_t>{input_height * input_width}));
phi::DenseTensor indices_tmp;
indices_tmp.ShareDataWith(indices);
indices_tmp.Resize(
phi::make_ddim(std::vector<int64_t>{input_height, input_width}));
std::vector<int64_t> indexs_value;
for (Type i = 0; i < input_height; i++) {
indexs_value.push_back(i * input_width);
}
phi::DenseTensor indexs_tmp(indices.type());
framework::TensorFromVector<int64_t>(
indexs_value, ctx.device_context(), &indexs_tmp);
indexs_tmp.Resize(phi::make_ddim(std::vector<int64_t>{input_height, 1}));
phi::DenseTensor indices_index(indices.type());
indices_index.mutable_data<int64_t>(indices_tmp.dims(), ctx.GetPlace());
const auto& runner_add =
NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {});
runner_add.Run(stream);
indices_index.Resize(
phi::make_ddim(std::vector<int64_t>{input_height * input_width}));
t_out->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor out_tmp(t_out->type());
out_tmp.ShareDataWith(*t_out);
const auto& runner = NpuOpRunner("TensorScatterUpdate",
{input_tmp, indices_index, input_tmp},
{out_tmp},
{});
runner.Run(stream);
}
template <typename T>
class ArgsortGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dO = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
int axis = ctx.Attr<int>("axis");
auto in_dims = indices->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (dO->numel() == 0) return;
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (axis == -1 || axis + 1 == in_dims.size()) {
FullAssignNPU<T, int64_t>(ctx, stream, in_dims, *dO, *indices, dX);
} else {
std::vector<int64_t> perm;
for (int64_t i = 0; i < in_dims.size(); i++) {
perm.emplace_back(i);
}
std::swap(perm[axis], perm[in_dims.size() - 1]);
std::vector<int64_t> shape;
for (size_t i = 0; i < perm.size(); i++) {
shape.emplace_back(in_dims[perm[i]]);
}
auto trans_dims = phi::make_ddim(shape);
phi::DenseTensor trans_dout(dO->type());
phi::DenseTensor trans_ids(indices->type());
trans_dout.Resize(trans_dims);
trans_ids.Resize(trans_dims);
TranposeNPU<T>(ctx, stream, &perm, *dO, &trans_dout);
TranposeNPU<int64_t>(ctx, stream, &perm, *indices, &trans_ids);
phi::DenseTensor trans_dx(dO->type());
trans_dx.Resize(trans_dims);
FullAssignNPU<T, int64_t>(
ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx);
TranposeNPU<T>(ctx, stream, &perm, trans_dx, dX);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(argsort,
ops::ArgsortNPUKernel<float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ArgsortNPUKernel<int64_t>,
#endif
ops::ArgsortNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(argsort_grad,
ops::ArgsortGradNPUKernel<float>,
ops::ArgsortGradNPUKernel<paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/assign_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace framework {
class OpDesc;
class Variable;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AssignNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
assign,
ops::AssignNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::AssignNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::AssignNPUKernel<paddle::platform::NPUDeviceContext, double>)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP_ITSELF(assign);
USE_OP_DEVICE_KERNEL(assign, NPU);
template <typename T>
void Compare(f::Scope* scope,
const p::DeviceContext& ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
std::vector<T> init;
init.push_back(static_cast<T>(1.0));
init.push_back(static_cast<T>(2.0));
init.push_back(static_cast<T>(3.0));
init.push_back(static_cast<T>(4.0));
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({4});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
auto op =
f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {});
op->Run(*scope, place);
std::vector<T> out_vec;
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
EXPECT_EQ(out_vec[0], static_cast<T>(1.0));
EXPECT_EQ(out_vec[1], static_cast<T>(2.0));
EXPECT_EQ(out_vec[2], static_cast<T>(3.0));
EXPECT_EQ(out_vec[3], static_cast<T>(4.0));
}
TEST(assign, NPU_fp32) {
f::Scope scope;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "assign");
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/assign_value_op.h"
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(assign_value,
ops::AssignValueKernel<bool>,
ops::AssignValueKernel<int>,
ops::AssignValueKernel<int64_t>,
ops::AssignValueKernel<float>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
class NPUBatchNormOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
bool test_mode = is_test && (!trainable_stats);
bool training = !test_mode && !use_global_stats;
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
(x_dims.size() == 4UL || x_dims.size() == 3UL),
true,
platform::errors::InvalidArgument(
"The input tensor X's dimension must equal to 3 or 4. "
" But got X's shape = [%s], X's dimension = [%d].",
x_dims.to_str(),
x_dims.size()));
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_var = ctx.Input<phi::DenseTensor>("Variance");
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
auto *y = ctx.Output<phi::DenseTensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto x_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(x->dims(), dev_ctx);
auto y_tesnor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(y->dims(), dev_ctx);
x_tensor.ShareDataWith(*x);
y_tesnor.ShareDataWith(*y);
if (data_layout == DataLayout::kNHWC) {
x_tensor.set_layout(DataLayout::kNHWC);
y_tesnor.set_layout(DataLayout::kNHWC);
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (!training) {
const auto &runner_infer =
NpuOpRunner("BNInfer",
{x_tensor, *scale, *bias, *running_mean, *running_var},
{y_tesnor},
{{"epsilon", epsilon}});
runner_infer.Run(stream);
} else {
auto *mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
auto *variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
auto *saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
auto *saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");
mean_out->mutable_data<float>(ctx.GetPlace());
variance_out->mutable_data<float>(ctx.GetPlace());
saved_mean->mutable_data<float>(ctx.GetPlace());
saved_variance->mutable_data<float>(ctx.GetPlace());
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
phi::DenseTensor mom_cpu;
paddle::framework::TensorCopySync(
*mom_tensor, platform::CPUPlace(), &mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
phi::DenseTensor sum, square_sum;
sum.mutable_data<float>(running_mean->dims(), ctx.GetPlace());
square_sum.mutable_data<float>(running_mean->dims(), ctx.GetPlace());
// BNTrainingReduce ONLY support rank = 4
if (x->dims().size() == 3) {
auto x_shape_vec = phi::vectorize(x->dims());
if (data_layout == DataLayout::kNCHW) {
x_shape_vec.push_back(1); // expand NCL -> NCL1
} else {
x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C
}
auto x_new_shape = phi::make_ddim(x_shape_vec);
x_tensor.Resize(x_new_shape);
}
const auto &runner_reduce = NpuOpRunner("BNTrainingReduce",
{x_tensor},
{sum, square_sum},
{{"epsilon", epsilon}});
runner_reduce.Run(stream);
const auto &runner_update = NpuOpRunner(
"BNTrainingUpdate",
{x_tensor,
sum,
square_sum,
*scale,
*bias,
*running_mean,
*running_var},
{y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance},
{{"factor", momentum}, {"epsilon", epsilon}});
runner_update.Run(stream);
}
}
};
template <typename T>
class NPUBatchNormGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto *d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
const auto *saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
// SavedVariance have been reverted in forward operator
const auto *saved_inv_variance =
ctx.Input<phi::DenseTensor>("SavedVariance");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
const float epsilon = ctx.Attr<float>("epsilon");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_scale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
use_global_stats = is_test || use_global_stats;
auto &dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto x_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(x->dims(), dev_ctx);
auto dy_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(d_y->dims(), dev_ctx);
x_tensor.ShareDataWith(*x);
dy_tensor.ShareDataWith(*d_y);
if (data_layout == DataLayout::kNHWC) {
x_tensor.set_layout(DataLayout::kNHWC);
dy_tensor.set_layout(DataLayout::kNHWC);
}
auto scale_grad_tmp =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(scale->dims(), dev_ctx);
auto bias_grad_tmp =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(bias->dims(), dev_ctx);
if (d_scale == nullptr) {
d_scale = &scale_grad_tmp;
}
if (d_bias == nullptr) {
d_bias = &bias_grad_tmp;
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (d_scale && d_bias) {
d_scale->mutable_data<float>(ctx.GetPlace());
d_bias->mutable_data<float>(ctx.GetPlace());
if (use_global_stats) {
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_variance = ctx.Input<phi::DenseTensor>("Variance");
const auto &runner_update =
NpuOpRunner("BNTrainingUpdateGrad",
{dy_tensor, x_tensor, *running_mean, *running_variance},
{*d_scale, *d_bias},
{{"epsilon", epsilon}});
runner_update.Run(stream);
} else {
const auto &runner_update =
NpuOpRunner("BNTrainingUpdateGrad",
{dy_tensor, x_tensor, *saved_mean, *saved_inv_variance},
{*d_scale, *d_bias},
{{"epsilon", epsilon}});
runner_update.Run(stream);
}
}
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
auto dx_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(d_x->dims(), dev_ctx);
dx_tensor.ShareDataWith(*d_x);
if (data_layout == DataLayout::kNHWC) {
dx_tensor.set_layout(DataLayout::kNHWC);
}
if (use_global_stats) {
if (x->dims().size() == 3) {
// BNInferGrad only support x rank = 4,
auto x_shape_vec = phi::vectorize(d_x->dims());
if (data_layout == DataLayout::kNCHW) {
x_shape_vec.push_back(1); // expand NCL -> NCL1
} else {
x_shape_vec.insert(x_shape_vec.begin() + 2,
1); // expand NLC -> NL1C
}
auto x_new_shape = phi::make_ddim(x_shape_vec);
dx_tensor.Resize(x_new_shape);
dy_tensor.Resize(x_new_shape);
}
const auto *running_var = ctx.Input<phi::DenseTensor>("Variance");
const auto &runner_infer =
NpuOpRunner("BNInferGrad",
{dy_tensor, *scale, *running_var},
{dx_tensor},
{{"epsilon", epsilon}});
runner_infer.Run(stream);
} else {
const auto &runner_reduce = NpuOpRunner("BNTrainingReduceGrad",
{dy_tensor,
x_tensor,
*d_scale,
*d_bias,
*scale,
*saved_mean,
*saved_inv_variance},
{dx_tensor},
{{"epsilon", epsilon}});
runner_reduce.Run(stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(batch_norm,
ops::NPUBatchNormOpKernel<float>,
ops::NPUBatchNormOpKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(batch_norm_grad,
ops::NPUBatchNormGradOpKernel<float>,
ops::NPUBatchNormGradOpKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class BCELossNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("BinaryCrossEntropy",
{*x, *labels},
{*out},
{{"reduction", static_cast<std::string>("none")}});
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class BCELossGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("BinaryCrossEntropyGrad",
{*x, *labels, *dout},
{*dx},
{{"reduction", static_cast<std::string>("none")}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
bce_loss,
ops::BCELossNPUKernel<plat::NPUDeviceContext, float>,
ops::BCELossNPUKernel<plat::NPUDeviceContext, plat::float16>);
REGISTER_OP_NPU_KERNEL(
bce_loss_grad,
ops::BCELossGradNPUKernel<plat::NPUDeviceContext, float>,
ops::BCELossGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/beam_search_op.h"
namespace ops = paddle::operators;
using NPUCtx = paddle::platform::NPUDeviceContext;
REGISTER_OP_NPU_KERNEL(beam_search,
ops::BeamSearchOpKernel<float, NPUCtx>,
ops::BeamSearchOpKernel<double, NPUCtx>,
ops::BeamSearchOpKernel<int, NPUCtx>,
ops::BeamSearchOpKernel<int64_t, NPUCtx>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
namespace paddle {
namespace operators {
static std::map<framework::proto::VarType::Type, aclDataType>
DTYPE_2_ACL_DTYPE = {
{framework::proto::VarType::BOOL, ACL_BOOL},
{framework::proto::VarType::INT16, ACL_INT16},
{framework::proto::VarType::INT32, ACL_INT32},
{framework::proto::VarType::INT64, ACL_INT64},
{framework::proto::VarType::FP16, ACL_FLOAT16},
{framework::proto::VarType::FP32, ACL_FLOAT},
{framework::proto::VarType::FP64, ACL_DOUBLE},
};
template <typename DeviceContext, typename T>
class CastNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
int dtype = ctx.Attr<int>("out_dtype");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
if (framework::TransToProtoVarType(x->dtype()) == dtype) {
// NOTE(zhiqiu): NPU cast op may result in wrong value, so
// add special case here.
VLOG(4) << "cast to same dtype:" << dtype;
out->mutable_data(place, x->type());
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
out);
return;
}
auto iter = DTYPE_2_ACL_DTYPE.find(
static_cast<framework::proto::VarType::Type>(dtype));
int aclDtype = iter->second;
if (dtype == framework::proto::VarType::FP32) {
out->mutable_data<float>(place);
} else if (dtype == framework::proto::VarType::FP16) {
out->mutable_data<paddle::platform::float16>(place);
} else if (dtype == framework::proto::VarType::INT16) {
out->mutable_data<int16_t>(place);
} else if (dtype == framework::proto::VarType::INT32) {
out->mutable_data<int32_t>(place);
} else if (dtype == framework::proto::VarType::INT64) {
out->mutable_data<int64_t>(place);
} else if (dtype == framework::proto::VarType::FP64) {
out->mutable_data<double>(place);
} else if (dtype == framework::proto::VarType::BOOL) {
out->mutable_data<bool>(place);
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner(
"Cast", {*x}, {*out}, {{"dst_type", static_cast<int32_t>(aclDtype)}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
cast,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int16_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int32_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, bool>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, double>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/clip_by_norm_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class NPUClipByNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto max_norm = context.Attr<float>("max_norm");
auto in_var = context.InputVar("X");
if (!(in_var->IsType<phi::DenseTensor>())) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid input variable type, only support LodTensor"
"type, but got type is %s.",
framework::ToTypeName(in_var->Type())));
}
auto place = context.GetPlace();
auto& dev_ctx =
context.template device_context<paddle::platform::NPUDeviceContext>();
auto stream = dev_ctx.stream();
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(place);
PADDLE_ENFORCE_NOT_NULL(input,
platform::errors::InvalidArgument(
"Input(X) of ClipByNormOp should not be null. "
"Please check if it is created correctly."));
phi::DenseTensor square_sum(input->type());
square_sum.mutable_data<T>(framework::DDim({1}), place);
const auto& x_dims = input->dims();
std::vector<int> axis;
for (int i = 0; i < x_dims.size(); ++i) {
axis.push_back(i);
}
const auto& square_sum_runner =
NpuOpRunner("SquareSumV1",
{*input},
{square_sum},
{{"axis", axis}, {"keep_dims", false}});
square_sum_runner.Run(stream);
phi::DenseTensor x_norm(input->type());
x_norm.mutable_data<T>(framework::DDim({1}), place);
const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {});
x_norm_runner.Run(stream);
phi::DenseTensor x_norm_t;
framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t);
auto x_norm_v = static_cast<float>(*x_norm_t.data<T>());
if (x_norm_v <= max_norm) {
framework::TensorCopy(*input, place, dev_ctx, output);
} else {
auto epsilon = x_norm_v <= static_cast<float>(1e-30)
? static_cast<float>(1e-6)
: static_cast<float>(0);
float scaling = max_norm / (x_norm_v + epsilon);
const auto& muls_runner =
NpuOpRunner("Muls", {*input}, {*output}, {{"value", scaling}});
muls_runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
clip_by_norm,
ops::NPUClipByNormKernel<paddle::platform::NPUDeviceContext, float>,
ops::NPUClipByNormKernel<paddle::platform::NPUDeviceContext,
plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ClipNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto min_tensor =
ctx.HasInput("Min") ? ctx.Input<phi::DenseTensor>("Min") : nullptr;
auto max_tensor =
ctx.HasInput("Max") ? ctx.Input<phi::DenseTensor>("Max") : nullptr;
phi::DenseTensor min_tensor_temp(x->type());
phi::DenseTensor max_tensor_temp(x->type());
if (min_tensor == nullptr) {
auto min_value = static_cast<T>(ctx.Attr<float>("min"));
min_tensor_temp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&min_tensor_temp, min_value);
min_tensor = &min_tensor_temp;
}
if (max_tensor == nullptr) {
auto max_value = static_cast<T>(ctx.Attr<float>("max"));
max_tensor_temp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&max_tensor_temp, max_value);
max_tensor = &max_tensor_temp;
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("ClipByValue", {*x, *min_tensor, *max_tensor}, {*out}, {});
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class ClipGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto* min_tensor =
ctx.HasInput("Min") ? ctx.Input<phi::DenseTensor>("Min") : nullptr;
auto* max_tensor =
ctx.HasInput("Max") ? ctx.Input<phi::DenseTensor>("Max") : nullptr;
auto min_val = ctx.Attr<float>("min");
if (min_tensor) {
phi::DenseTensor min_data;
framework::TensorCopy(
*min_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&min_data);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
min_val = static_cast<float>(min_data.data<T>()[0]);
}
auto max_val = ctx.Attr<float>("max");
if (max_tensor) {
phi::DenseTensor max_data;
framework::TensorCopy(
*max_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&max_data);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
max_val = static_cast<float>(max_data.data<T>()[0]);
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("HardtanhGrad",
{*x, *dout},
{*dx},
{{"min_val", min_val}, {"max_val", max_val}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
clip,
ops::ClipNPUKernel<plat::NPUDeviceContext, float>,
ops::ClipNPUKernel<plat::NPUDeviceContext, plat::float16>);
REGISTER_OP_NPU_KERNEL(
clip_grad,
ops::ClipGradNPUKernel<plat::NPUDeviceContext, float>,
ops::ClipGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ConcatNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
phi::DenseTensor* out = ctx.Output<phi::DenseTensor>("Out");
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
auto axis = ctx.Attr<int>("axis");
if (ctx.HasInput("AxisTensor")) {
PADDLE_THROW(platform::errors::NotFound(
"The AxisTensor is not supported on NPU now."));
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
std::vector<phi::DenseTensor> inputs;
std::vector<std::string> names;
for (size_t i = 0; i < ins.size(); ++i) {
if (ins[i] && ins[i]->numel() > 0) {
inputs.push_back(*ins[i]);
names.push_back("x" + std::to_string(i));
} else {
continue;
}
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
NpuOpRunner runner{
"ConcatD",
{inputs},
{*out},
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
runner.AddInputNames(names);
runner.Run(stream);
}
};
template <typename T>
class ConcatGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
auto outs = ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("X"));
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
auto axis = ctx.Attr<int>("axis");
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
int offset = 0;
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
for (size_t j = 0; j < outs.size(); ++j) {
// For stop gradient
// get output tensor that the name is not kEmptyVarName
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
std::vector<int> offsets;
std::vector<int> sizes;
for (int dim = 0; dim < ins[j]->dims().size(); ++dim) {
if (dim == axis) {
offsets.push_back(offset);
sizes.push_back(ins[j]->dims()[dim]);
} else {
offsets.push_back(0);
sizes.push_back(ins[j]->dims()[dim]);
}
}
const auto& runner =
NpuOpRunner("SliceD",
{*out_grad},
{*outs[j]},
{{"offsets", offsets}, {"size", sizes}});
runner.Run(stream);
}
if (ins[j]->numel() != 0UL) {
offset += ins[j]->dims()[axis];
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(concat,
ops::ConcatNPUKernel<float>,
ops::ConcatNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ConcatNPUKernel<int64_t>,
#endif
ops::ConcatNPUKernel<int>);
REGISTER_OP_NPU_KERNEL(concat_grad,
ops::ConcatGradNPUKernel<float>,
ops::ConcatGradNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ConcatGradNPUKernel<int64_t>,
#endif
ops::ConcatGradNPUKernel<int>);
This diff is collapsed.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
class Conv2DTransposeNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> output_padding =
ctx.Attr<std::vector<int>>("output_padding");
const std::vector<int> stride = ctx.Attr<std::vector<int>>("strides");
std::vector<int> padding = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilation = ctx.Attr<std::vector<int>>("dilations");
const std::string data_format = ctx.Attr<std::string>("data_format");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
// check dimension
const bool channel_last = data_format == "NHWC";
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&padding, &dilation, padding_algorithm, in_data_dims, stride, ksize);
// construct NPU attr
std::vector<int> strides(4, 1);
std::vector<int> dilations(4, 1);
phi::DenseTensor input_tensor, output_tensor;
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
if (channel_last) {
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
strides[1] = stride[0];
strides[2] = stride[1];
dilations[1] = dilation[0];
dilations[2] = dilation[1];
} else {
strides[2] = stride[0];
strides[3] = stride[1];
dilations[2] = dilation[0];
dilations[3] = dilation[1];
}
for (auto i = output_padding.size(); i < 4; ++i) {
output_padding.insert(output_padding.begin(), 0);
}
auto output_dim_vec = phi::vectorize(output_tensor.dims());
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
const auto& runner = NpuOpRunner("Conv2DTransposeD",
{input_tensor, *filter},
{output_tensor},
{{"input_size", output_dim_vec},
{"strides", strides},
{"dilations", dilations},
{"output_padding", output_padding},
{"groups", groups},
{"pads", padding},
{"data_format", data_format}});
runner.Run(stream);
}
};
template <typename T>
class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
const phi::DenseTensor* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
phi::DenseTensor* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
phi::DenseTensor* filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const int groups = ctx.Attr<int>("groups");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const phi::DataLayout data_layout = phi::StringToDataLayout(data_format);
auto in_dims = input->dims();
auto filter_dims = filter->dims();
// auto out_grad_dims = output_grad->dims();
// const int batch_size = static_cast<int>(input->dims()[0]);
const bool channel_last = (data_layout == phi::DataLayout::kNHWC);
framework::DDim in_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
std::vector<int> strides_vec(4, 1);
std::vector<int> dilations_vec(4, 1);
phi::DenseTensor input_tensor, output_grad_tensor;
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
if (channel_last) {
input_tensor.set_layout(DataLayout::kNHWC);
output_grad_tensor.set_layout(DataLayout::kNHWC);
strides_vec[1] = strides[0];
strides_vec[2] = strides[1];
dilations_vec[1] = dilations[0];
dilations_vec[2] = dilations[1];
} else {
strides_vec[2] = strides[0];
strides_vec[3] = strides[1];
dilations_vec[2] = dilations[0];
dilations_vec[3] = dilations[1];
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Conv2DBackpropFilterD",
{output_grad_tensor, input_tensor},
{*filter_grad},
{{"filter_size", phi::vectorize<int>(filter_dims)},
{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor;
input_grad_tensor.ShareDataWith(*input_grad);
if (channel_last) {
input_grad_tensor.set_layout(DataLayout::kNHWC);
}
const auto& runner = NpuOpRunner("Conv2D",
{output_grad_tensor, *filter},
{input_grad_tensor},
{{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
}
};
template <typename T>
class Conv3DTransposeNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> output_padding =
ctx.Attr<std::vector<int>>("output_padding");
const std::vector<int> stride = ctx.Attr<std::vector<int>>("strides");
std::vector<int> padding = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilation = ctx.Attr<std::vector<int>>("dilations");
std::string data_format = ctx.Attr<std::string>("data_format");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
// check dimension
const bool channel_last = data_format == "NHWC";
if (data_format == "NHWC") {
data_format = "NDHWC";
} else {
data_format = "NCDHW";
}
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&padding, &dilation, padding_algorithm, in_data_dims, stride, ksize);
// construct NPU attr
std::vector<int> strides(5, 1);
std::vector<int> dilations(5, 1);
phi::DenseTensor input_tensor, output_tensor, filter_tensor;
input_tensor.Resize(input->dims());
input_tensor.ShareDataWith(*input);
output_tensor.Resize(output->dims());
output_tensor.ShareDataWith(*output);
filter_tensor.Resize(filter->dims());
filter_tensor.ShareDataWith(*filter);
PADDLE_ENFORCE_EQ(
dilation[0],
1,
platform::errors::InvalidArgument(
"dilation[0] must be equal 1, but received %d.", dilation[0]));
if (channel_last) {
input_tensor.set_layout(DataLayout::kNDHWC);
output_tensor.set_layout(DataLayout::kNDHWC);
strides[1] = stride[0];
strides[2] = stride[1];
strides[3] = stride[2];
dilations[2] = dilation[1];
dilations[3] = dilation[2];
} else {
input_tensor.set_layout(DataLayout::kNCDHW);
output_tensor.set_layout(DataLayout::kNCDHW);
strides[2] = stride[0];
strides[3] = stride[1];
strides[4] = stride[2];
dilations[3] = dilation[1];
dilations[4] = dilation[2];
}
filter_tensor.set_layout(DataLayout::kNCDHW);
auto output_dim_vec = phi::vectorize<int32_t>(output_tensor.dims());
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
NpuOpRunner runner;
runner.SetType("Conv3DBackpropInputD")
.AddInput(filter_tensor)
.AddInput(input_tensor)
.AddAttr("input_size", output_dim_vec)
.AddAttr("strides", strides)
.AddAttr("pads", padding)
.AddAttr("dilations", dilations)
.AddAttr("groups", groups)
.AddAttr("data_format", data_format)
.AddOutput(output_tensor);
runner.Run(dev_ctx.stream());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(conv2d_transpose,
ops::Conv2DTransposeNPUKernel<float>,
ops::Conv2DTransposeNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(conv2d_transpose_grad,
ops::Conv2DTransposeGradNPUKernel<float>,
ops::Conv2DTransposeGradNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(conv3d_transpose,
ops::Conv3DTransposeNPUKernel<float>,
ops::Conv3DTransposeNPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/crop_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class CropNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
std::vector<int> offset_list;
if (ctx.HasInput("Offsets")) {
auto* offsets_tensor = ctx.Input<phi::DenseTensor>("Offsets");
paddle::framework::TensorToVector(
*offsets_tensor, ctx.device_context(), &offset_list);
if (offset_list.empty()) {
offset_list.resize(x->dims().size(), 0);
}
} else {
auto res = ctx.Attr<std::vector<int>>("offsets");
if (res.empty()) {
offset_list.resize(x->dims().size(), 0);
} else {
offset_list.insert(offset_list.end(), res.begin(), res.end());
}
}
PADDLE_ENFORCE_EQ(
static_cast<int64_t>(offset_list.size()),
x->dims().size(),
platform::errors::InvalidArgument(
"The shape (%d) of CropOp's "
"'offset' attribute should be equal to the shape of dims "
"(%d) of the Input(X).",
offset_list.size(),
x->dims().size()));
int axis_int = 0;
framework::NPUAttributeMap attr_input = {{"offsets", offset_list},
{"axis", axis_int}};
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
if (ctx.HasInput("Y")) {
auto* shape = ctx.Input<phi::DenseTensor>("Y");
PADDLE_ENFORCE_EQ(shape->dims().size(),
x->dims().size(),
platform::errors::InvalidArgument(
"The shape of dims of (%d) of CropOp's "
"Input(shape) should be equal to the shape of dims "
"(%d) of the Input(X).",
shape->dims().size(),
x->dims().size()));
// shape memory maybe have gc.
phi::DenseTensor tmp_shape(*shape);
tmp_shape.mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
} else {
auto shape_size = ctx.Attr<std::vector<int>>("shape");
PADDLE_ENFORCE_EQ(shape_size.size(),
x->dims().size(),
platform::errors::InvalidArgument(
"The shape of dims of (%d) of CropOp's "
"Input(shape) should be equal to the shape of dims "
"(%d) of the Input(X).",
shape_size.size(),
x->dims().size()));
phi::DenseTensor tmp_shape(x->dtype());
tmp_shape.Resize(phi::make_ddim(shape_size));
tmp_shape.mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
crop,
ops::CropNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::CropNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::CropNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
static void CumsumImp(const phi::DenseTensor& input,
phi::DenseTensor* output,
const framework::NPUAttributeMap& attr_input,
const framework::ExecutionContext& ctx) {
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (framework::TransToProtoVarType(input.dtype()) ==
framework::proto::VarType::INT64) {
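    // For INT64 input, CumsumD is run in float32: cast the input up front,
    // accumulate into a float32 buffer, then cast the result back to the
    // output dtype.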
phi::DenseTensor tmp_input;
tmp_input.mutable_data<float>(input.dims(), ctx.GetPlace());
auto dst_acl_dtype =
ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type()));
const auto& cast_runner_1 =
NpuOpRunner("Cast",
{input},
{tmp_input},
{{"dst_type", static_cast<int>(dst_acl_dtype)}});
cast_runner_1.Run(stream);
phi::DenseTensor tmp_output;
tmp_output.mutable_data<float>(output->dims(), ctx.GetPlace());
const auto& runner =
NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input);
runner.Run(stream);
dst_acl_dtype =
ConvertToNpuDtype(framework::TransToProtoVarType(output->type()));
const auto& cast_runner_2 =
NpuOpRunner("Cast",
{tmp_output},
{*output},
{{"dst_type", static_cast<int>(dst_acl_dtype)}});
cast_runner_2.Run(stream);
} else {
const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input);
runner.Run(stream);
}
}
template <typename DeviceContext, typename T>
class CumSumNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int axis = ctx.Attr<int>("axis");
bool exclusive = ctx.Attr<bool>("exclusive");
bool reverse = ctx.Attr<bool>("reverse");
out->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {
{"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}};
bool flatten = ctx.Attr<bool>("flatten");
if (flatten) {
PADDLE_ENFORCE_EQ(
axis,
-1,
platform::errors::InvalidArgument(
"when flatten is true, attr axis must be default %d, but got %d",
-1,
axis));
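      // Flatten: view x as a 1-D tensor of numel() elements before the scan.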
phi::DenseTensor new_x(x->type());
new_x.ShareDataWith(*x);
new_x.Resize(phi::make_ddim({x->numel()}));
CumsumImp(new_x, out, attr_input, ctx);
} else {
CumsumImp(*x, out, attr_input, ctx);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
cumsum,
ops::CumSumNPUKernel<plat::NPUDeviceContext, int>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::CumSumNPUKernel<plat::NPUDeviceContext, int64_t>,
#endif
ops::CumSumNPUKernel<plat::NPUDeviceContext, float>,
ops::CumSumNPUKernel<plat::NPUDeviceContext, plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class DropoutNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* seed_tensor =
ctx.HasInput("Seed") ? ctx.Input<phi::DenseTensor>("Seed") : nullptr;
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto* mask = ctx.Output<phi::DenseTensor>("Mask");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto is_test = ctx.Attr<bool>("is_test");
out->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (dropout_prob == 1.) {
const auto& runner_zeros_out = NpuOpRunner("ZerosLike", {*out}, {*out});
runner_zeros_out.Run(stream);
mask->mutable_data<uint8_t>(ctx.GetPlace());
const auto& runner_zeros_mask =
NpuOpRunner("ZerosLike", {*mask}, {*mask});
runner_zeros_mask.Run(stream);
return;
}
    // Only the default `upscale_in_train` dropout implementation is
    // supported here.
if (!is_test) {
phi::DenseTensor tmp_x(x->dtype());
phi::DenseTensor tmp_out(out->dtype());
tmp_x.ShareDataWith(*x);
tmp_out.ShareDataWith(*out);
if (x->dims().size() == 1) {
        // DropOutDoMask produces wrong results for 1-D input,
        // so reshape it to 2-D here.
std::vector<int> vec_dim = phi::vectorize<int>(x->dims());
tmp_x.Resize(phi::make_ddim({vec_dim[0], 1}));
tmp_out.Resize(phi::make_ddim({vec_dim[0], 1}));
}
int seed = 0;
int seed2 = 0;
float keep_prob = 1. - dropout_prob;
if (seed_tensor) {
std::vector<int> seed_data;
paddle::framework::TensorToVector(
*seed_tensor, ctx.device_context(), &seed_data);
seed = seed_data[0];
} else {
seed = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : 0;
}
phi::DenseTensor keep_prob_tensor(x->dtype());
keep_prob_tensor.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&keep_prob_tensor,
static_cast<T>(keep_prob));
mask->mutable_data<uint8_t>(ctx.GetPlace());
      // The bit mask produced by the `DropOutGenMask` NPU op has a different
      // layout from this kernel's output `Mask`.
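      // DropOutGenMask emits one mask bit per element and expects the element
      // count rounded up to a multiple of 128, so the uint8 buffer holds
      // length / 8 bytes.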
phi::DenseTensor npu_mask(phi::DataType::UINT8);
uint32_t length = (x->numel() + 128 - 1) / 128 * 128;
npu_mask.Resize(phi::make_ddim({length / 8}));
npu_mask.mutable_data<uint8_t>(ctx.GetPlace());
// TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU
// OP must be a scalar with shape[0]. At present, the shape
// of the `prob` phi::DenseTensor of this OP is forced to be set to 0
// in `npu_op_runner.cc`, which needs to be optimized later.
NpuOpRunner runner_gen_mask;
runner_gen_mask.SetType("DropOutGenMask")
.AddInput(phi::vectorize(tmp_out.dims()))
.AddInput(keep_prob_tensor)
.AddOutput(npu_mask)
.AddAttr("seed", seed)
.AddAttr("seed2", seed2);
runner_gen_mask.Run(stream);
NpuOpRunner runner_dropout;
runner_dropout.SetType("DropOutDoMask")
.AddInput(tmp_x)
.AddInput(npu_mask)
.AddInput(keep_prob_tensor)
.AddOutput(tmp_out);
runner_dropout.Run(stream);
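      // After DropOutDoMask, `out` holds zeros at dropped positions, so
      // casting it to bool (and then to uint8 below) recovers the `Mask`
      // output.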
// cast `out` from float/float16 to bool
phi::DenseTensor cast_mask(phi::DataType::BOOL);
cast_mask.Resize(mask->dims());
cast_mask.mutable_data<bool>(ctx.GetPlace());
auto dst_dtype_bool =
ConvertToNpuDtype(framework::TransToProtoVarType(cast_mask.dtype()));
const auto& runner_cast_mask_bool =
NpuOpRunner("Cast",
{*out},
{cast_mask},
{{"dst_type", static_cast<int>(dst_dtype_bool)}});
runner_cast_mask_bool.Run(stream);
// cast cast_mask from bool to uint8
auto dst_dtype_uint8 =
ConvertToNpuDtype(framework::TransToProtoVarType(mask->dtype()));
const auto& runner_cast_mask_uint8 =
NpuOpRunner("Cast",
{cast_mask},
{*mask},
{{"dst_type", static_cast<int>(dst_dtype_uint8)}});
runner_cast_mask_uint8.Run(stream);
} else {
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
out);
}
}
};
template <typename DeviceContext, typename T>
class DropoutGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE_EQ(is_test,
false,
platform::errors::PreconditionNotMet(
"GradOp is only callable when is_test is false"));
dx->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (dropout_prob == 1.) {
const auto& runner_zeros = NpuOpRunner("ZerosLike", {*dx}, {*dx});
runner_zeros.Run(stream);
return;
}
// cast mask from uint8 to float32/float16
phi::DenseTensor cast_mask(dx->dtype());
cast_mask.Resize(mask->dims());
cast_mask.mutable_data<T>(ctx.GetPlace());
auto dst_dtype =
ConvertToNpuDtype(framework::TransToProtoVarType(dx->dtype()));
const auto& runner_cast_mask =
NpuOpRunner("Cast",
{*mask},
{cast_mask},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_mask.Run(stream);
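    // MaskedScale multiplies dout by the mask and rescales the kept values
    // by 1 / keep_prob, matching the `upscale_in_train` forward pass.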
const auto& runner =
NpuOpRunner("MaskedScale",
{*dout, cast_mask},
{*dx},
{{"value", static_cast<float>(1. / (1 - dropout_prob))}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
dropout,
ops::DropoutNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::DropoutNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
dropout_grad,
ops::DropoutGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::DropoutGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/expand_as_v2_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ExpandAsV2NPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto target_rank = target_shape.size();
PADDLE_ENFORCE_GE(target_rank,
rank,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be greater than or equal to "
"the rank (%d) of the input 'x'.",
target_rank,
rank));
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument("The rank (%d) of the input 'x' for "
"expand_as_v2 op must be positive.",
rank));
PADDLE_ENFORCE_LE(target_rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be less than or equal to %d.",
target_rank,
MAX_RANK_SUPPORTED));
ExpandAs(context);
}
protected:
void ExpandAs(const framework::ExecutionContext& context) const {
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = target_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(target_shape[i],
0,
platform::errors::InvalidArgument(
"The value of target shape cannot be zero."));
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
target_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in "
"target tensor for expand_as_v2 op.",
vec_in_dims[i],
target_shape[i]));
}
}
auto* out0 = context.Output<phi::DenseTensor>("Out");
framework::DDim out_dims = phi::make_ddim(target_shape);
out0->Resize(out_dims);
out0->mutable_data<T>(context.GetPlace());
const auto& runner =
NpuOpRunner("ExpandD", {*in0}, {*out0}, {{"shape", target_shape}});
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
expand_as_v2,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext, int8_t>,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext, uint8_t>,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ExpandNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The number of dimensions of the input 'x' for Op(expand) "
"must be greater than or equal to 1, but the value received is %d.",
rank));
PADDLE_ENFORCE_LE(
rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The number of dimensions of the input 'x' for Op(expand) "
"must be less than or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED,
rank));
switch (rank) {
case 1:
Expand<1>(context);
break;
case 2:
Expand<2>(context);
break;
case 3:
Expand<3>(context);
break;
case 4:
Expand<4>(context);
break;
case 5:
Expand<5>(context);
break;
case 6:
Expand<6>(context);
break;
}
}
protected:
template <int Rank>
void Expand(const framework::ExecutionContext& context) const {
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
auto expand_times = get_expand_times(context);
PADDLE_ENFORCE_EQ(static_cast<size_t>(in_dims.size()),
expand_times.size(),
platform::errors::InvalidArgument(
"The number of elements (%d) of 'expand_times' for "
"Op(expand) must be equal to the number "
"of dimensions (%d) of the input.",
expand_times.size(),
static_cast<size_t>(in_dims.size())));
auto* out0 = context.Output<phi::DenseTensor>("Out");
framework::DDim out_dims(in_dims);
for (size_t i = 0; i < expand_times.size(); ++i) {
out_dims[i] *= expand_times[i];
}
auto place = context.GetPlace();
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
out0->Resize(out_dims);
out0->mutable_data<T>(place);
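    // When every entry of expand_times is 1, the output has exactly as many
    // elements as the input, so a plain device copy is sufficient.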
    bool is_expand_times_all_one = (out0->numel() == in0->numel());
if (is_expand_times_all_one) {
memory::Copy(place,
out0->mutable_data<T>(place),
place,
in0->data<T>(),
in0->numel() * sizeof(T),
stream);
if (out_dims != in_dims) {
out0->Resize(out_dims);
}
} else {
const auto& runner =
NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
expand,
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <iostream>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP(expand);
USE_OP_DEVICE_KERNEL(expand, NPU);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto in = scope->Var("X");
auto expand_times = scope->Var("ExpandTimes");
auto out = scope->Var("Out");
auto in_t = in->GetMutable<phi::DenseTensor>();
auto out_t = out->GetMutable<phi::DenseTensor>();
auto expand_times_t = expand_times->GetMutable<phi::DenseTensor>();
auto place = ctx.GetPlace();
paddle::framework::TensorFromVector(std::vector<T>(3 * 1 * 7, 1), ctx, in_t);
paddle::framework::TensorFromVector(
std::vector<int>({1, 10, 1}), ctx, expand_times_t);
in_t->Resize(phi::make_ddim({3, 1, 7}));
expand_times_t->Resize(phi::make_ddim({3}));
out_t->Resize(phi::make_ddim({3, 10, 7}));
out_t->mutable_data<T>(place);
f::AttributeMap attrs = {{}};
auto op =
f::OpRegistry::CreateOp("expand",
{{"X", {"X"}}, {"ExpandTimes", {"ExpandTimes"}}},
{{"Out", {"Out"}}},
attrs);
op->Run(*scope, place);
ctx.Wait();
auto out_dim = out_t->dims();
EXPECT_EQ(out_dim.at(0), 3);
EXPECT_EQ(out_dim.at(1), 10);
EXPECT_EQ(out_dim.at(2), 7);
}
TEST(expand, NPU_fp32) {
f::Scope scope;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_v2_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ExpandV2NPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
auto in_dims = X->dims();
auto expand_shape = get_expand_shape(ctx);
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = expand_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
std::vector<int> final_expand_shape(vec_in_dims.size());
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size cannot be zero."));
if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] -->
// final_expand_shape = [3,4,10,2]
PADDLE_ENFORCE_GT(
expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size (%d) for non-existing dimensions must be "
"positive for expand_v2 op.",
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X =
// [10,1] --> final_expand_shape =
// [3,4,10,4]
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
expand_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in shape for expand_v2 op.",
vec_in_dims[i],
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else {
final_expand_shape[i] = expand_shape[i];
}
} else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape
// = [3,4,10,2]
PADDLE_ENFORCE_EQ(
expand_shape[i],
-1,
platform::errors::InvalidArgument(
"When the value in shape is negative for expand_v2 op, "
"only -1 is supported, but the value received is %d.",
expand_shape[i]));
final_expand_shape[i] = vec_in_dims[i];
}
}
framework::NPUAttributeMap attr_input = {{"shape", final_expand_shape}};
auto rank = X->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The rank of the input 'X' for expand_v2_npu op must be positive, "
"but the value received is %d.",
rank));
PADDLE_ENFORCE_LE(
rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank of the input 'X' for expand_v2_npu op must be less than "
"or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED,
rank));
auto shape_size = final_expand_shape.size();
PADDLE_ENFORCE_GE(
shape_size,
rank,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for expand_v2_npu op must "
"be "
"greater than or equal to the rank (%d) of the input 'X'.",
shape_size,
rank));
PADDLE_ENFORCE_LE(shape_size,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for "
"expand_v2_npu op must be "
"less than or equal to %d.",
shape_size,
MAX_RANK_SUPPORTED));
framework::DDim out_dims = phi::make_ddim(final_expand_shape);
Out->Resize(out_dims);
Out->mutable_data<T>(ctx.GetPlace());
const auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs);
runner.Run(dev_ctx.stream());
};
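    // bool and int64 inputs are routed through TypeAdapter so that ExpandD
    // runs in uint8 / int32; the result is cast back to the original dtype.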
if (framework::TransToProtoVarType(X->dtype()) ==
framework::proto::VarType::BOOL) {
NpuOpRunner::TypeAdapter({*X},
{*Out},
attr_input,
dev_ctx,
op_func,
{framework::proto::VarType::UINT8},
{framework::proto::VarType::UINT8});
} else if (framework::TransToProtoVarType(X->dtype()) ==
framework::proto::VarType::INT64) {
NpuOpRunner::TypeAdapter({*X},
{*Out},
attr_input,
dev_ctx,
op_func,
{framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input);
runner.Run(dev_ctx.stream());
}
}
};
template <typename DeviceContext, typename T>
class ExpandV2NPUGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
// case 1: reduce dout dims to dx dims
// For example: [2, 120] --> [120]
auto reduce_ndim = dout->dims().size() - dx->dims().size();
std::vector<int> axes;
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
phi::DenseTensor tmp_dout(dout->dtype());
phi::DenseTensor reduced_dout(dx->dtype());
tmp_dout.ShareDataWith(*dout);
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
tmp_dout.Resize(phi::make_ddim(reduced_dout_dims));
reduced_dout.Resize(phi::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("ReduceSumD",
{*dout},
{reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = reduced_dout;
}
// case 2: reduce axis of dout in which dim is 1
// For example: [12, 140] --> [1, 140]
// case 3: copy dout to dx when shape is totally same, and dim in dx != 1
// For example: [2, 10, 5] --> [2, 10, 5]
axes.clear();
for (auto i = 0; i < dx->dims().size(); ++i) {
if (dx->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
const auto& runner = NpuOpRunner("ReduceSumD",
{tmp_dout},
{*dx},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopySync(tmp_dout, ctx.GetPlace(), dx);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
expand_v2,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, bool>);
REGISTER_OP_NPU_KERNEL(
expand_v2_grad,
ops::ExpandV2NPUGradKernel<paddle::platform::NPUDeviceContext, float>,
ops::ExpandV2NPUGradKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>,
ops::ExpandV2NPUGradKernel<paddle::platform::NPUDeviceContext, int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class EyeNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto num_rows = ctx.Attr<int64_t>("num_rows");
auto d_nums = ctx.Attr<int>("dtype");
auto dtype =
ConvertToNpuDtype(static_cast<framework::proto::VarType::Type>(d_nums));
auto num_columns = ctx.Attr<int64_t>("num_columns");
if (num_columns == -1) num_columns = num_rows;
framework::NPUAttributeMap attr_input = {
{"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}};
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
eye,
ops::EyeNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::EyeNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::EyeNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T>
class FillAnyLikeNPUKernel : public framework::OpKernel<T> {
public:
using CommonType = typename std::common_type<
float,
typename std::conditional<std::is_same<T, platform::float16>::value,
float,
T>::type>::type;
void Compute(const framework::ExecutionContext& context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
auto* out = context.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(context.GetPlace());
float value = context.Attr<float>("value");
auto common_type_value = static_cast<CommonType>(value);
PADDLE_ENFORCE_EQ(
(common_type_value >=
static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
(common_type_value <=
static_cast<CommonType>(std::numeric_limits<T>::max())),
true,
platform::errors::InvalidArgument(
"The filled value is out of range for target type, "
"current kernel type is %s, the range should between %f "
"and %f, but now value is %f.",
typeid(T).name(),
static_cast<CommonType>(std::numeric_limits<T>::lowest()),
static_cast<CommonType>(std::numeric_limits<T>::max()),
value));
PADDLE_ENFORCE_EQ(
std::isnan(value),
false,
platform::errors::InvalidArgument("The filled value is NaN."));
    phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type));
tensor_tmp.mutable_data<T>({1}, context.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_tmp, static_cast<T>(value));
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto shape = out->dims();
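    // The NPU `Fill` op takes the target shape as its first input and a
    // single-element tensor holding the fill value as its second.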
NpuOpRunner runner;
runner.SetType("Fill")
.AddInput(phi::vectorize(shape))
.AddInput(tensor_tmp)
.AddOutput(*out)
.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(fill_any_like,
ops::FillAnyLikeNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::FillAnyLikeNPUKernel<int64_t>,
#endif
ops::FillAnyLikeNPUKernel<float>,
ops::FillAnyLikeNPUKernel<paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto float_value = ctx.Attr<float>("value");
auto str_value = ctx.Attr<std::string>("str_value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto *out = ctx.Output<phi::DenseTensor>("Out");
auto *in = ctx.Input<phi::DenseTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the phi::DenseTensor.
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
if (cpu_place) {
auto &dev_ctx = *pool.Get(platform::CPUPlace());
phi::funcs::SetConstant<phi::CPUContext, T> functor;
out->mutable_data(platform::CPUPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const phi::CPUContext &>(dev_ctx),
out,
static_cast<T>(value));
} else {
out->mutable_data(ctx.GetPlace(),
framework::TransToPhiDataType(data_type));
phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type));
tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_tmp, value);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto &runner = NpuOpRunner("FillD",
{tensor_tmp},
{*out},
{{"dims", phi::vectorize(out->dims())}});
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpNPUKernel<
paddle::platform::NPUDeviceContext,
float>,
ops::FillConstantBatchSizeLikeOpNPUKernel<
paddle::platform::NPUDeviceContext,
int>,
ops::FillConstantBatchSizeLikeOpNPUKernel<
paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto str_value = ctx.Attr<std::string>("str_value");
auto float_value = ctx.Attr<float>("value");
auto *out_var = ctx.Output<phi::DenseTensor>("Out");
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
auto shape = GetShape(ctx);
out_var->mutable_data<T>(shape, ctx.GetPlace());
if (data_type != framework::proto::VarType::BOOL) {
      phi::DenseTensor tensor_value(framework::TransToPhiDataType(data_type));
tensor_value.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_value, value);
NpuOpRunner runner;
runner.SetType("Fill")
.AddInput(phi::vectorize(shape))
.AddInput(tensor_value)
.AddOutput(*out_var)
.Run(stream);
} else {
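      // Boolean outputs go through TypeAdapter: the value is written as
      // uint8 and the result is cast back to bool afterwards.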
const auto &dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto op_func = [&shape, &value](
                         const std::vector<phi::DenseTensor> &inputs,
                         const std::vector<phi::DenseTensor> &outputs,
                         const NPUAttributeMap &attrs,
                         const platform::NPUDeviceContext &dev_ctx) {
        phi::DenseTensor tensor_value;
tensor_value.mutable_data<uint8_t>({1}, dev_ctx.GetPlace());
FillNpuTensorWithConstant<uint8_t>(&tensor_value,
static_cast<uint8_t>(value));
NpuOpRunner runner;
runner.SetType("Fill")
.AddInput(phi::vectorize(shape))
.AddInput(tensor_value)
.AddOutput(outputs[0])
.Run(dev_ctx.stream());
};
NpuOpRunner::TypeAdapter({},
{*out_var},
{},
dev_ctx,
op_func,
{},
{framework::proto::VarType::UINT8});
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_NPU_KERNEL(
fill_constant,
paddle::operators::FillConstantNPUKernel<float>,
paddle::operators::FillConstantNPUKernel<bool>,
paddle::operators::FillConstantNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
paddle::operators::FillConstantNPUKernel<int64_t>,
#endif
paddle::operators::FillConstantNPUKernel<paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_zeros_like_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillZerosLikeNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<phi::DenseTensor>("X");
auto* out = context.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*out});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
fill_zeros_like,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, double>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, bool>);
This diff is collapsed.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
class GatherNdNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *out = ctx.Output<phi::DenseTensor>("Out");
out->template mutable_data<T>(ctx.GetPlace());
if (x->numel() == 0) return;
if (index->numel() == 0) {
framework::TensorCopy(*x, ctx.GetPlace(), ctx.device_context(), out);
return;
}
const auto &index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
platform::errors::InvalidArgument(
"Index holds the wrong type, it holds [%s],"
"but desires to be [%s] or [%s]",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {});
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
runner.Run(stream);
}
};
template <typename T>
class GatherNdGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *p = dx->mutable_data<T>(ctx.GetPlace());
if (dx->numel() == 0) return;
if (index->numel() == 0) {
framework::TensorCopy(*dout, ctx.GetPlace(), ctx.device_context(), dx);
return;
}
phi::DenseTensor tmp_tensor(index->type());
phi::DenseTensor tmp_tensor2(dout->type());
const auto index_dims = index->dims();
if (index_dims.size() == 1) {
tmp_tensor.ShareDataWith(*index);
std::vector<int64_t> new_dim = {1, index_dims[0]};
tmp_tensor.Resize(phi::make_ddim(new_dim));
index = &tmp_tensor;
tmp_tensor2.ShareDataWith(*dout);
std::vector<int64_t> new_dim2{1};
for (int i = index->numel(); i < x->dims().size(); i++) {
new_dim2.push_back(x->dims()[i]);
}
tmp_tensor2.Resize(phi::make_ddim(new_dim2));
dout = &tmp_tensor2;
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
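    // Zero-fill dx, then scatter-add dout into it at the positions given by
    // index.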
platform::NPUMemsetAsync(
static_cast<void *>(p), 0, dx->numel() * sizeof(T), stream);
const auto &runner_scatter = NpuOpRunner(
"ScatterNdAdd", {*dx, *index, *dout}, {*dx}, {{"use_locking", false}});
runner_scatter.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(gather_nd,
ops::GatherNdNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::GatherNdNPUKernel<int64_t>,
#endif
ops::GatherNdNPUKernel<float>);
REGISTER_OP_NPU_KERNEL(gather_nd_grad,
ops::GatherNdGradNPUKernel<paddle::platform::float16>,
ops::GatherNdGradNPUKernel<float>);
This diff is collapsed.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/is_empty_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
is_empty,
ops::IsEmptyOpKernel<plat::NPUDeviceContext, float>,
ops::IsEmptyOpKernel<plat::NPUDeviceContext, plat::float16>);
This diff is collapsed.