diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 1d6e75c4b5d71c67f4c1d30161627d45a3c9fc34..a30909322eccfe9333ed3ab308d1b136ce623b58 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -96,7 +96,7 @@ register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 genera
     recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS})
-op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc run_program_op_npu.cc DEPS executor_cache ${OP_HEADER_DEPS})
+op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
 target_link_libraries(run_program_op cuda_graph_with_memory_pool)
 op_library(quantize_linear_op DEPS phi)
 op_library(save_combine_op DEPS string_array phi)
diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc
deleted file mode 100644
index 0a859d1f564a915e1a4eafab8098abe1dafd8514..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/abs_op_npu.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template
-class AbsNPUKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input("X");
-    auto* out = ctx.Output("Out");
-
-    out->mutable_data(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("Abs",
-                                     {
-                                         *x,
-                                     },
-                                     {*out},
-                                     {});
-
-    auto stream =
-        ctx.template device_context()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-template
-class AbsGradNPUKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input("X");
-    auto* dout = ctx.Input(framework::GradVarName("Out"));
-    auto* dx = ctx.Output(framework::GradVarName("X"));
-
-    dx->mutable_data(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("AbsGrad", {*x, *dout}, {*dx}, {});
-
-    auto stream =
-        ctx.template device_context()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(
-    abs,
-    ops::AbsNPUKernel,
-    ops::AbsNPUKernel);
-
-REGISTER_OP_NPU_KERNEL(
-    abs_grad,
-    ops::AbsGradNPUKernel,
-    ops::AbsGradNPUKernel);
diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc
deleted file mode 100644
index 9f3392f2eabc576b1fcb60de9ba0cd409f4729ce..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/activation_op_npu.cc
+++ /dev/null
@@ -1,1116 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors.
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include -#include - -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class PowNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto factor = ctx.Attr("factor"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Power", - {*x}, - {*out}, - {{"power", factor}, - {"scale", static_cast(1.0)}, - {"shift", static_cast(0.0)}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class PowGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto factor = ctx.Attr("factor"); - - auto x_dims = x->dims(); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(liym27): dx = dout * factor * x.pow(factor-1) - - // Step1: Compute x_pow = x.pow(factor-1) - phi::DenseTensor x_pow(x->type()); - x_pow.mutable_data(x->dims(), place); - const auto& runner_pow = NpuOpRunner( - "Power", {*x}, {x_pow}, {{"power", factor - static_cast(1)}}); - runner_pow.Run(stream); - - // Step 2: Construct a broadcast factor, which has the same shape with x. - - // 2.1 Get a factor tensor with shape [1]. - phi::DenseTensor factor_tensor(phi::DataType::FLOAT32); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, factor); - - // 2.2 Get the factor which has the shape with x and the same value with - // factor. 
- phi::DenseTensor factor_bc_tensor(phi::DataType::FLOAT32); - factor_bc_tensor.mutable_data(x_dims, place); - const auto& runner_bc = NpuOpRunner("FillD", - {factor_tensor}, - {factor_bc_tensor}, - {{"dims", phi::vectorize(x_dims)}}); - runner_bc.Run(stream); - - // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) - phi::DenseTensor x_power_mul_factor(x->type()); - x_power_mul_factor.mutable_data(x->dims(), place); - const auto& runner_mul_1 = - NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); - runner_mul_1.Run(stream); - - // Step 4: Compute dx = dout * factor * x.pow(factor-1) - dx->mutable_data(place); - const auto& runner_mul_2 = - NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); - runner_mul_2.Run(stream); - } -}; - -template -class ReluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Relu", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ReluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); - - runner.Run(stream); - } -}; - -template -class Relu6NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Relu6", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class Relu6GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Relu6Grad", {*dout, *out}, {*dx}, {}); - - runner.Run(stream); - } -}; - -template -class SqrtNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class LeakyReluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto alpha = ctx.Attr("alpha"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("LeakyRelu", {*x}, {*out}, {{"negative_slope", alpha}}); - runner.Run(stream); - } -}; - -template -class 
LeakyReluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto alpha = ctx.Attr("alpha"); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "LeakyReluGrad", {*dout, *x}, {*dx}, {{"negative_slope", alpha}}); - - runner.Run(stream); - } -}; - -template -class SqrtGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class LogNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor one(x->type()); - one.mutable_data(x->dims(), place); - const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {}); - runner_one.Run(stream); - - phi::DenseTensor sub(x->type()); - sub.mutable_data(x->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {}); - runner_sub.Run(stream); - - const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {}); - runner_out.Run(stream); - } -}; - -template -class LogGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); - runner.Run(stream); - } -}; - -template -class TanhNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class TanhGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class SquareNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto 
place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SquareGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto factor = static_cast(2.0); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - // Step 1: Compute x_muls_factor = factor * x - phi::DenseTensor x_muls_factor(x->type()); - x_muls_factor.mutable_data(x->dims(), place); - const auto& runner_muls_1 = - NpuOpRunner("Muls", {*x}, {x_muls_factor}, {{"value", factor}}); - runner_muls_1.Run(stream); - - // Step 2: Compute dx = dout * factor * x - dx->mutable_data(place); - const auto& runner_mul_2 = - NpuOpRunner("Mul", {*dout, x_muls_factor}, {*dx}, {}); - runner_mul_2.Run(stream); - } -}; - -template -class SigmoidNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sigmoid", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SigmoidGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = - NpuOpRunner("SigmoidGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -// Swish = x * sigmoid(beta * x) -template -class SwishNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - float beta = ctx.Attr("beta"); - - out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - - const auto& muls_runner = - NpuOpRunner("Muls", {*x}, {*out}, {{"value", beta}}); - muls_runner.Run(stream); - - const auto& sigmoid_runner = NpuOpRunner("Sigmoid", {*out}, {*out}, {}); - sigmoid_runner.Run(stream); - - const auto& mul_runner = NpuOpRunner("Mul", {*x, *out}, {*out}); - mul_runner.Run(stream); - } -}; - -template -class SwishGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - float beta = ctx.Attr("beta"); - - dx->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor beta_x, sigmoid_out, swish_out; - beta_x.mutable_data(x->dims(), ctx.GetPlace()); - sigmoid_out.mutable_data(x->dims(), ctx.GetPlace()); - swish_out.mutable_data(x->dims(), ctx.GetPlace()); - const auto& muls_runner = - NpuOpRunner("Muls", {*x}, {beta_x}, {{"value", beta}}); - muls_runner.Run(stream); - - const auto& sigmoid_runner = - 
NpuOpRunner("Sigmoid", {beta_x}, {sigmoid_out}, {}); - sigmoid_runner.Run(stream); - - const auto& mul_runner = - NpuOpRunner("Mul", {sigmoid_out, *x}, {swish_out}, {}); - mul_runner.Run(stream); - const auto& muls_runner2 = - NpuOpRunner("Muls", {swish_out}, {swish_out}, {{"value", beta}}); - muls_runner2.Run(stream); - - const auto& mul_runner1 = - NpuOpRunner("Mul", {sigmoid_out, swish_out}, {*dx}, {}); - mul_runner1.Run(stream); - - const auto& sub_runner = NpuOpRunner("Sub", {swish_out, *dx}, {*dx}, {}); - sub_runner.Run(stream); - - const auto& add_runner = NpuOpRunner("Add", {sigmoid_out, *dx}, {*dx}, {}); - add_runner.Run(stream); - - const auto& mul_runner2 = NpuOpRunner("Mul", {*dout, *dx}, {*dx}, {}); - mul_runner2.Run(stream); - } -}; - -// HardSwish = min(max(0, x+offset), threshold) * x / scale -template -class HardSwishNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_offset(x->type()); - tensor_offset.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - - phi::DenseTensor add_offset_val(x->type()); - add_offset_val.mutable_data(x->dims(), place); - const auto& runner_add = - NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); - runner_add.Run(stream); - - phi::DenseTensor tensor_threshold(x->type()); - tensor_threshold.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_threshold, static_cast(threshold)); - - phi::DenseTensor tensor_zero(x->type()); - tensor_zero.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_zero, static_cast(0.0)); - - phi::DenseTensor clip_val(x->type()); - clip_val.mutable_data(x->dims(), place); - const auto& runner_clip = - NpuOpRunner("ClipByValue", - {add_offset_val, tensor_zero, tensor_threshold}, - {clip_val}); - runner_clip.Run(stream); - - phi::DenseTensor tensor_scale_tmp(x->type()); - tensor_scale_tmp.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_scale_tmp, static_cast(scale)); - phi::DenseTensor tensor_scale(x->type()); - tensor_scale.mutable_data(x->dims(), place); - const auto& runner_fill = - NpuOpRunner("FillD", - {tensor_scale_tmp}, - {tensor_scale}, - {{"dims", phi::vectorize(x->dims())}}); - runner_fill.Run(stream); - - phi::DenseTensor div_val(x->type()); - div_val.mutable_data(x->dims(), place); - const auto& runner_div = - NpuOpRunner("Div", {clip_val, tensor_scale}, {div_val}); - runner_div.Run(stream); - - const auto& runner_mul = NpuOpRunner("Mul", {*x, div_val}, {*out}); - runner_mul.Run(stream); - } -}; - -template -class HardSwishGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_offset(x->type()); - tensor_offset.mutable_data({1}, place); - 
FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - - phi::DenseTensor add_offset_val(x->type()); - add_offset_val.mutable_data(x->dims(), place); - const auto& runner_add = - NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); - runner_add.Run(stream); - - phi::DenseTensor tmp1(x->type()); - tmp1.mutable_data(x->dims(), place); - const auto& runner_pow1 = NpuOpRunner( - "Power", {*x}, {tmp1}, {{"scale", 2.0f}, {"shift", offset}}); - runner_pow1.Run(stream); - - phi::DenseTensor tmp2(x->type()); - tmp2.mutable_data(x->dims(), place); - const auto& runner_ht_grad = - NpuOpRunner("HardtanhGrad", - {add_offset_val, tmp1}, - {tmp2}, - {{"min_val", 0.0f}, {"max_val", threshold}}); - runner_ht_grad.Run(stream); - - phi::DenseTensor tmp3(x->type()); - tmp3.mutable_data(x->dims(), place); - const auto& runner_pow2 = NpuOpRunner( - "Power", {tmp2}, {tmp3}, {{"scale", 1.0f / scale}, {"shift", 1.0f}}); - runner_pow2.Run(stream); - - phi::DenseTensor tensor_threshold_tmp(x->type()); - tensor_threshold_tmp.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_threshold_tmp, - static_cast(threshold)); - phi::DenseTensor tensor_threshold(x->type()); - tensor_threshold.mutable_data(x->dims(), place); - const auto& runner_fill = - NpuOpRunner("FillD", - {tensor_threshold_tmp}, - {tensor_threshold}, - {{"dims", phi::vectorize(x->dims())}}); - runner_fill.Run(stream); - - phi::DenseTensor tmp_bool(phi::DataType::BOOL); - tmp_bool.mutable_data(x->dims(), place); - const auto& runner_less = - NpuOpRunner("Less", {add_offset_val, tensor_threshold}, {tmp_bool}); - runner_less.Run(stream); - phi::DenseTensor tmp4(x->type()); - tmp4.mutable_data(x->dims(), place); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast = - NpuOpRunner("Cast", - {tmp_bool}, - {tmp4}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - - phi::DenseTensor tmp5(x->type()); - tmp5.mutable_data(x->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {tmp3, tmp4}, {tmp5}); - runner_sub.Run(stream); - - const auto& runner_final = NpuOpRunner("Mul", {tmp5, *dout}, {*dx}); - runner_final.Run(stream); - } -}; - -template -class HardSigmoidNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - float slope = ctx.Attr("slope"); - float offset = ctx.Attr("offset"); - - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {{"alpha", slope}, - {"beta", offset}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("HardSigmoid", {*x}, {*out}, attr_input); - runner.Run(stream); - } -}; - -template -class HardSigmoidGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - float slope = ctx.Attr("slope"); - float offset = ctx.Attr("offset"); - - dx->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {{"alpha", slope}, - {"beta", offset}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = - NpuOpRunner("HardSigmoidGrad", {*dout, *out}, {*dx}, attr_input); - runner_dx.Run(stream); - } -}; - -template -class ReciprocalNPUKernel : public framework::OpKernel { - 
public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - out->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Reciprocal", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ReciprocalGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto place = ctx.GetPlace(); - dx->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_dx = - NpuOpRunner("ReciprocalGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class CosNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Cos", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class CosGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - dx->mutable_data(place); - - phi::DenseTensor sin_out(x->type()); // Temporary phi::DenseTensor - sin_out.Resize(x->dims()); - sin_out.mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Sin", {*x}, {sin_out}, {}); - runner.Run(stream); - - const auto& runner_dx = NpuOpRunner("Mul", {*dout, sin_out}, {*dx}, {}); - runner_dx.Run(stream); - - phi::DenseTensor tmp(x->type()); // Temporary phi::DenseTensor - tmp.Resize(phi::make_ddim({1, 1})); - tmp.mutable_data(place); - float factor = -1.; - FillNpuTensorWithConstant(&tmp, static_cast(factor)); - - const auto& runner_dx_ = NpuOpRunner("Xdivy", {*dx, tmp}, {*dx}, {}); - runner_dx_.Run(stream); - // dx = -dout * Sine(x); - } -}; - -template -class AtanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - out->mutable_data(place); - const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class AtanGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto place = ctx.GetPlace(); - dx->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_dx = NpuOpRunner("AtanGrad", {*x, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class ExpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - 
out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Exp", {*x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ExpGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Mul", {*dout, *out}, {*dx}, {}); - runner.Run(stream); - } -}; - -template -class SinNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sin", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - pow, - ops::PowNPUKernel, - ops::PowNPUKernel); - -REGISTER_OP_NPU_KERNEL( - pow_grad, - ops::PowGradNPUKernel, - ops::PowGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu, - ops::ReluNPUKernel, - ops::ReluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu_grad, - ops::ReluGradNPUKernel, - ops::ReluGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu6, - ops::Relu6NPUKernel, - ops::Relu6NPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu6_grad, - ops::Relu6GradNPUKernel, - ops::Relu6GradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - leaky_relu, - ops::LeakyReluNPUKernel, - ops::LeakyReluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - leaky_relu_grad, - ops::LeakyReluGradNPUKernel, - ops::LeakyReluGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sqrt, - ops::SqrtNPUKernel, - ops::SqrtNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sqrt_grad, - ops::SqrtGradNPUKernel, - ops::SqrtGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - log, - ops::LogNPUKernel, - ops::LogNPUKernel); - -REGISTER_OP_NPU_KERNEL( - log_grad, - ops::LogGradNPUKernel, - ops::LogGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - tanh, - ops::TanhNPUKernel, - ops::TanhNPUKernel); - -REGISTER_OP_NPU_KERNEL( - tanh_grad, - ops::TanhGradNPUKernel, - ops::TanhGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - square, - ops::SquareNPUKernel, - ops::SquareNPUKernel, - ops::SquareNPUKernel); - -REGISTER_OP_NPU_KERNEL( - square_grad, - ops::SquareGradNPUKernel, - ops::SquareNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sigmoid, - ops::SigmoidNPUKernel, - ops::SigmoidNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sigmoid_grad, - ops::SigmoidGradNPUKernel, - ops::SigmoidGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(swish, - ops::SwishNPUKernel, - ops::SwishNPUKernel); - -REGISTER_OP_NPU_KERNEL(swish_grad, - ops::SwishGradNPUKernel, - ops::SwishGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(hard_swish, - ops::HardSwishNPUKernel, - ops::HardSwishNPUKernel); - -REGISTER_OP_NPU_KERNEL(hard_swish_grad, - ops::HardSwishGradNPUKernel, - ops::HardSwishGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - hard_sigmoid, - ops::HardSigmoidNPUKernel, - ops::HardSigmoidNPUKernel); - -REGISTER_OP_NPU_KERNEL( - hard_sigmoid_grad, - ops::HardSigmoidGradNPUKernel, - ops::HardSigmoidGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - reciprocal, - ops::ReciprocalNPUKernel, - ops::ReciprocalNPUKernel, - ops::ReciprocalNPUKernel); - 
-REGISTER_OP_NPU_KERNEL( - reciprocal_grad, - ops::ReciprocalGradNPUKernel, - ops::ReciprocalGradNPUKernel, - ops::ReciprocalGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - cos, - ops::CosNPUKernel, - ops::CosNPUKernel); - -REGISTER_OP_NPU_KERNEL( - cos_grad, - ops::CosGradNPUKernel, - ops::CosGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - atan, - ops::AtanNPUKernel, - ops::AtanNPUKernel); - -REGISTER_OP_NPU_KERNEL( - atan_grad, - ops::AtanGradNPUKernel, - ops::AtanGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - exp, - ops::ExpNPUKernel, - ops::ExpNPUKernel); - -REGISTER_OP_NPU_KERNEL( - exp_grad, - ops::ExpGradNPUKernel, - ops::ExpGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sin, - ops::SinNPUKernel, - ops::SinNPUKernel, - ops::SinNPUKernel); diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc deleted file mode 100644 index 18915ee4f3d79b02d7a948cd962ae8e07e93b9f8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ /dev/null @@ -1,286 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void TranposeNPU(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - std::vector* perm, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(in) - .AddInput(std::move(*perm)) - .AddOutput(*out) - .Run(stream); -} - -static void CastToInt64(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_INT64) - .Run(stream); -} - -static void CastToFP32(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT) - .Run(stream); -} - -template -class ArgsortNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - auto stream = ctx.template device_context().stream(); - framework::NPUAttributeMap attr = {{"axis", -1}, - {"descending", descending}}; - - phi::DenseTensor indices_tmp(phi::DataType::INT32); - indices_tmp.Resize(indices->dims()); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::INT64) { - phi::DenseTensor input_fp32(phi::DataType::FLOAT32); - input_fp32.Resize(input->dims()); - CastToFP32(ctx, stream, *input, &input_fp32); - - phi::DenseTensor output_fp32(phi::DataType::FLOAT32); - output_fp32.Resize(output->dims()); - - if (axis == -1 || axis + 1 == in_dims.size()) { - output_fp32.mutable_data(ctx.GetPlace()); - indices_tmp.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Sort", {input_fp32}, {output_fp32, indices_tmp}, attr); - runner.Run(stream); - - CastToInt64(ctx, stream, output_fp32, output); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_input(input_fp32.type()); - trans_input.Resize(trans_dims); - TranposeNPU(ctx, stream, &perm, input_fp32, &trans_input); - - phi::DenseTensor trans_output(input_fp32.type()); - phi::DenseTensor trans_indices(phi::DataType::INT32); - trans_output.mutable_data(trans_dims, ctx.GetPlace()); - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "Sort", {trans_input}, {trans_output, trans_indices}, attr); - runner.Run(stream); - - TranposeNPU(ctx, stream, &perm, trans_output, &output_fp32); - TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); - - CastToInt64(ctx, stream, output_fp32, output); - } - } else { - if (axis == -1 || axis + 1 == in_dims.size()) { - output->mutable_data(ctx.GetPlace()); - indices_tmp.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr); - runner.Run(stream); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_input(input->type()); - trans_input.Resize(trans_dims); - TranposeNPU(ctx, stream, &perm, *input, &trans_input); - - phi::DenseTensor trans_output(input->type()); - phi::DenseTensor trans_indices(phi::DataType::INT32); - trans_output.mutable_data(trans_dims, ctx.GetPlace()); - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "Sort", {trans_input}, {trans_output, trans_indices}, attr); - runner.Run(stream); - - TranposeNPU(ctx, stream, &perm, trans_output, output); - TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); - } - } - - CastToInt64(ctx, stream, indices_tmp, indices); - } -}; - -template -static void FullAssignNPU(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const framework::DDim in_dims, - const phi::DenseTensor& input, - const phi::DenseTensor& indices, - phi::DenseTensor* t_out) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - phi::DenseTensor 
input_tmp; - input_tmp.ShareDataWith(input); - input_tmp.Resize( - phi::make_ddim(std::vector{input_height * input_width})); - - phi::DenseTensor indices_tmp; - indices_tmp.ShareDataWith(indices); - indices_tmp.Resize( - phi::make_ddim(std::vector{input_height, input_width})); - - std::vector indexs_value; - for (Type i = 0; i < input_height; i++) { - indexs_value.push_back(i * input_width); - } - phi::DenseTensor indexs_tmp(indices.type()); - framework::TensorFromVector( - indexs_value, ctx.device_context(), &indexs_tmp); - indexs_tmp.Resize(phi::make_ddim(std::vector{input_height, 1})); - - phi::DenseTensor indices_index(indices.type()); - indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); - const auto& runner_add = - NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); - runner_add.Run(stream); - - indices_index.Resize( - phi::make_ddim(std::vector{input_height * input_width})); - - t_out->mutable_data(ctx.GetPlace()); - phi::DenseTensor out_tmp(t_out->type()); - out_tmp.ShareDataWith(*t_out); - - const auto& runner = NpuOpRunner("TensorScatterUpdate", - {input_tmp, indices_index, input_tmp}, - {out_tmp}, - {}); - runner.Run(stream); -} - -template -class ArgsortGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - if (dO->numel() == 0) return; - - auto stream = ctx.template device_context().stream(); - - if (axis == -1 || axis + 1 == in_dims.size()) { - FullAssignNPU(ctx, stream, in_dims, *dO, *indices, dX); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_dout(dO->type()); - phi::DenseTensor trans_ids(indices->type()); - trans_dout.Resize(trans_dims); - trans_ids.Resize(trans_dims); - - TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); - TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); - - phi::DenseTensor trans_dx(dO->type()); - trans_dx.Resize(trans_dims); - FullAssignNPU( - ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx); - - TranposeNPU(ctx, stream, &perm, trans_dx, dX); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(argsort, - ops::ArgsortNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ArgsortNPUKernel, -#endif - ops::ArgsortNPUKernel); - -REGISTER_OP_NPU_KERNEL(argsort_grad, - ops::ArgsortGradNPUKernel, - ops::ArgsortGradNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc deleted file mode 100644 index ff88427c123368b893cc115ce96a119a5902cdf9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/assign_op_npu.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include
-
-#include "paddle/fluid/operators/assign_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace framework {
-class OpDesc;
-class Variable;
-}  // namespace framework
-namespace imperative {
-class OpBase;
-}  // namespace imperative
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-template
-class AssignNPUKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input("X");
-    auto* out = ctx.Output("Out");
-    out->mutable_data(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
-    auto stream =
-        ctx.template device_context()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(
-    assign,
-    ops::AssignNPUKernel,
-    ops::AssignNPUKernel,
-    ops::AssignNPUKernel)
diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc
deleted file mode 100644
index 25d8d07802ad1ea85e0c042d18c1678e185a47e2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/assign_op_npu_test.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include
-#endif
-
-#include
-#include  // NOLINT
-#include
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP_ITSELF(assign);
-USE_OP_DEVICE_KERNEL(assign, NPU);
-
-template
-void Compare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             std::string op_type) {
-  // init
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable();
-
-  std::vector init;
-  init.push_back(static_cast(1.0));
-  init.push_back(static_cast(2.0));
-  init.push_back(static_cast(3.0));
-  init.push_back(static_cast(4.0));
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({4});
-
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable();
-
-  auto op =
-      f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {});
-
-  op->Run(*scope, place);
-
-  std::vector out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-
-  ctx.Wait();
-
-  EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
-  EXPECT_EQ(out_vec[0], static_cast(1.0));
-  EXPECT_EQ(out_vec[1], static_cast(2.0));
-  EXPECT_EQ(out_vec[2], static_cast(3.0));
-  EXPECT_EQ(out_vec[3], static_cast(4.0));
-}
-
-TEST(assign, NPU_fp32) {
-  f::Scope scope;
-  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
-  Compare(&scope, *ctx, "assign");
-}
diff --git a/paddle/fluid/operators/assign_value_op_npu.cc b/paddle/fluid/operators/assign_value_op_npu.cc
deleted file mode 100644
index 5354f26d6fa73a6a315b2766d9fb58481cbf2285..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/assign_value_op_npu.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/assign_value_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(assign_value,
-                       ops::AssignValueKernel,
-                       ops::AssignValueKernel,
-                       ops::AssignValueKernel,
-                       ops::AssignValueKernel);
diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc
deleted file mode 100644
index 15774d5712fff4dc0ebe4a7c7ba48f4136c3cd6d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/batch_norm_op_npu.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/batch_norm_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class NPUBatchNormOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - - bool test_mode = is_test && (!trainable_stats); - bool training = !test_mode && !use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - (x_dims.size() == 4UL || x_dims.size() == 3UL), - true, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 3 or 4. " - " But got X's shape = [%s], X's dimension = [%d].", - x_dims.to_str(), - x_dims.size())); - - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - auto x_tensor = - ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto y_tesnor = - ctx.AllocateTmpTensor(y->dims(), dev_ctx); - x_tensor.ShareDataWith(*x); - y_tesnor.ShareDataWith(*y); - if (data_layout == DataLayout::kNHWC) { - x_tensor.set_layout(DataLayout::kNHWC); - y_tesnor.set_layout(DataLayout::kNHWC); - } - - auto stream = ctx.template device_context().stream(); - if (!training) { - const auto &runner_infer = - NpuOpRunner("BNInfer", - {x_tensor, *scale, *bias, *running_mean, *running_var}, - {y_tesnor}, - {{"epsilon", epsilon}}); - runner_infer.Run(stream); - } else { - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - phi::DenseTensor mom_cpu; - paddle::framework::TensorCopySync( - *mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - phi::DenseTensor sum, square_sum; - sum.mutable_data(running_mean->dims(), ctx.GetPlace()); - square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); - - // BNTrainingReduce ONLY support rank = 4 - if (x->dims().size() == 3) { - auto x_shape_vec = phi::vectorize(x->dims()); - if (data_layout == DataLayout::kNCHW) { - x_shape_vec.push_back(1); // expand NCL -> NCL1 - } else { - 
x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C - } - auto x_new_shape = phi::make_ddim(x_shape_vec); - x_tensor.Resize(x_new_shape); - x_tensor.Resize(x_new_shape); - } - const auto &runner_reduce = NpuOpRunner("BNTrainingReduce", - {x_tensor}, - {sum, square_sum}, - {{"epsilon", epsilon}}); - runner_reduce.Run(stream); - - const auto &runner_update = NpuOpRunner( - "BNTrainingUpdate", - {x_tensor, - sum, - square_sum, - *scale, - *bias, - *running_mean, - *running_var}, - {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance}, - {{"factor", momentum}, {"epsilon", epsilon}}); - runner_update.Run(stream); - } - } -}; - -template -class NPUBatchNormGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = - ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - auto &dev_ctx = ctx.template device_context(); - auto x_tensor = - ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto dy_tensor = - ctx.AllocateTmpTensor(d_y->dims(), dev_ctx); - x_tensor.ShareDataWith(*x); - dy_tensor.ShareDataWith(*d_y); - if (data_layout == DataLayout::kNHWC) { - x_tensor.set_layout(DataLayout::kNHWC); - dy_tensor.set_layout(DataLayout::kNHWC); - } - - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); - auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); - if (d_scale == nullptr) { - d_scale = &scale_grad_tmp; - } - if (d_bias == nullptr) { - d_bias = &bias_grad_tmp; - } - - auto stream = ctx.template device_context().stream(); - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - const auto &runner_update = - NpuOpRunner("BNTrainingUpdateGrad", - {dy_tensor, x_tensor, *running_mean, *running_variance}, - {*d_scale, *d_bias}, - {{"epsilon", epsilon}}); - runner_update.Run(stream); - } else { - const auto &runner_update = - NpuOpRunner("BNTrainingUpdateGrad", - {dy_tensor, x_tensor, *saved_mean, *saved_inv_variance}, - {*d_scale, *d_bias}, - {{"epsilon", epsilon}}); - runner_update.Run(stream); - } - } - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - auto dx_tensor = - ctx.AllocateTmpTensor(d_x->dims(), dev_ctx); - dx_tensor.ShareDataWith(*d_x); - if (data_layout == DataLayout::kNHWC) { - dx_tensor.set_layout(DataLayout::kNHWC); - } - if (use_global_stats) { - if (x->dims().size() == 3) { - // BNInferGrad only support x rank = 4, - auto x_shape_vec = phi::vectorize(d_x->dims()); - if (data_layout == DataLayout::kNCHW) { - x_shape_vec.push_back(1); // expand NCL -> NCL1 - } else { - 
x_shape_vec.insert(x_shape_vec.begin() + 2, - 1); // expand NLC -> NL1C - } - auto x_new_shape = phi::make_ddim(x_shape_vec); - dx_tensor.Resize(x_new_shape); - dy_tensor.Resize(x_new_shape); - } - const auto *running_var = ctx.Input("Variance"); - const auto &runner_infer = - NpuOpRunner("BNInferGrad", - {dy_tensor, *scale, *running_var}, - {dx_tensor}, - {{"epsilon", epsilon}}); - runner_infer.Run(stream); - } else { - const auto &runner_reduce = NpuOpRunner("BNTrainingReduceGrad", - {dy_tensor, - x_tensor, - *d_scale, - *d_bias, - *scale, - *saved_mean, - *saved_inv_variance}, - {dx_tensor}, - {{"epsilon", epsilon}}); - runner_reduce.Run(stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(batch_norm, - ops::NPUBatchNormOpKernel, - ops::NPUBatchNormOpKernel); -REGISTER_OP_NPU_KERNEL(batch_norm_grad, - ops::NPUBatchNormGradOpKernel, - ops::NPUBatchNormGradOpKernel); diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc deleted file mode 100644 index ed8872d90ef6f5a2bd93b890909bcf26a01d6257..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class BCELossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("BinaryCrossEntropy", - {*x, *labels}, - {*out}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); - } -}; - -template -class BCELossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("BinaryCrossEntropyGrad", - {*x, *labels, *dout}, - {*dx}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - bce_loss, - ops::BCELossNPUKernel, - ops::BCELossNPUKernel); - -REGISTER_OP_NPU_KERNEL( - bce_loss_grad, - ops::BCELossGradNPUKernel, - ops::BCELossGradNPUKernel); diff --git a/paddle/fluid/operators/beam_search_op_npu.cc b/paddle/fluid/operators/beam_search_op_npu.cc deleted file mode 100644 index 147d1be2262556359d8d3e3581bd1bbabb1c156a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/beam_search_op_npu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" - -namespace ops = paddle::operators; -using NPUCtx = paddle::platform::NPUDeviceContext; - -REGISTER_OP_NPU_KERNEL(beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc deleted file mode 100644 index 411e112318d12c05da23811ff9f6eaf00da9324d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cast_op_npu.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -namespace paddle { -namespace operators { - -static std::map - DTYPE_2_ACL_DTYPE = { - {framework::proto::VarType::BOOL, ACL_BOOL}, - {framework::proto::VarType::INT16, ACL_INT16}, - {framework::proto::VarType::INT32, ACL_INT32}, - {framework::proto::VarType::INT64, ACL_INT64}, - {framework::proto::VarType::FP16, ACL_FLOAT16}, - {framework::proto::VarType::FP32, ACL_FLOAT}, - {framework::proto::VarType::FP64, ACL_DOUBLE}, -}; - -template -class CastNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - int dtype = ctx.Attr("out_dtype"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - - if (framework::TransToProtoVarType(x->dtype()) == dtype) { - // NOTE(zhiqiu): NPU cast op may result in wrong value, so - // add special case here. - VLOG(4) << "cast to same dtype:" << dtype; - out->mutable_data(place, x->type()); - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - return; - } - - auto iter = DTYPE_2_ACL_DTYPE.find( - static_cast(dtype)); - int aclDtype = iter->second; - - if (dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*out}, {{"dst_type", static_cast(aclDtype)}}); - runner.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - cast, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op_npu.cc b/paddle/fluid/operators/clip_by_norm_op_npu.cc deleted file mode 100644 index f22f58d1769ea143810f503b00d844e2ec0768e9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/clip_by_norm_op_npu.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
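One caveat in the cast kernel above: the DTYPE_2_ACL_DTYPE lookup result is used without checking that the entry exists, so an unsupported out_dtype would dereference the map's end iterator. A defensive sketch of that lookup, reusing the names from the deleted file (an assumption on my part, not code from the original):

auto iter = DTYPE_2_ACL_DTYPE.find(
    static_cast<framework::proto::VarType::Type>(dtype));
if (iter == DTYPE_2_ACL_DTYPE.end()) {
  // Fail loudly instead of dereferencing a missing map entry.
  PADDLE_THROW(platform::errors::Unimplemented(
      "Cast to dtype %d is not supported on NPU.", dtype));
}
int aclDtype = iter->second;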
*/ - -#include "paddle/fluid/operators/clip_by_norm_op.h" - -namespace paddle { -namespace operators { - -template -class NPUClipByNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max_norm = context.Attr("max_norm"); - auto in_var = context.InputVar("X"); - - if (!(in_var->IsType())) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input variable type, only support LodTensor" - "type, but got type is %s.", - framework::ToTypeName(in_var->Type()))); - } - - auto place = context.GetPlace(); - auto& dev_ctx = - context.template device_context(); - auto stream = dev_ctx.stream(); - - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(place); - - PADDLE_ENFORCE_NOT_NULL(input, - platform::errors::InvalidArgument( - "Input(X) of ClipByNormOp should not be null. " - "Please check if it is created correctly.")); - - phi::DenseTensor square_sum(input->type()); - square_sum.mutable_data(framework::DDim({1}), place); - const auto& x_dims = input->dims(); - std::vector axis; - for (int i = 0; i < x_dims.size(); ++i) { - axis.push_back(i); - } - const auto& square_sum_runner = - NpuOpRunner("SquareSumV1", - {*input}, - {square_sum}, - {{"axis", axis}, {"keep_dims", false}}); - square_sum_runner.Run(stream); - - phi::DenseTensor x_norm(input->type()); - x_norm.mutable_data(framework::DDim({1}), place); - const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {}); - x_norm_runner.Run(stream); - - phi::DenseTensor x_norm_t; - framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t); - auto x_norm_v = static_cast(*x_norm_t.data()); - if (x_norm_v <= max_norm) { - framework::TensorCopy(*input, place, dev_ctx, output); - } else { - auto epsilon = x_norm_v <= static_cast(1e-30) - ? static_cast(1e-6) - : static_cast(0); - float scaling = max_norm / (x_norm_v + epsilon); - const auto& muls_runner = - NpuOpRunner("Muls", {*input}, {*output}, {{"value", scaling}}); - muls_runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - clip_by_norm, - ops::NPUClipByNormKernel, - ops::NPUClipByNormKernel); diff --git a/paddle/fluid/operators/clip_op_npu.cc b/paddle/fluid/operators/clip_op_npu.cc deleted file mode 100644 index 8977bd250e868544aff8c018d59279140ac8e933..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/clip_op_npu.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ClipNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto min_tensor = - ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto max_tensor = - ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; - - phi::DenseTensor min_tensor_temp(x->type()); - phi::DenseTensor max_tensor_temp(x->type()); - if (min_tensor == nullptr) { - auto min_value = static_cast(ctx.Attr("min")); - min_tensor_temp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&min_tensor_temp, min_value); - min_tensor = &min_tensor_temp; - } - - if (max_tensor == nullptr) { - auto max_value = static_cast(ctx.Attr("max")); - max_tensor_temp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&max_tensor_temp, max_value); - max_tensor = &max_tensor_temp; - } - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("ClipByValue", {*x, *min_tensor, *max_tensor}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ClipGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto* min_tensor = - ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto* max_tensor = - ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; - - auto min_val = ctx.Attr("min"); - if (min_tensor) { - phi::DenseTensor min_data; - framework::TensorCopy( - *min_tensor, - platform::CPUPlace(), - ctx.template device_context(), - &min_data); - ctx.template device_context().Wait(); - min_val = static_cast(min_data.data()[0]); - } - - auto max_val = ctx.Attr("max"); - if (max_tensor) { - phi::DenseTensor max_data; - framework::TensorCopy( - *max_tensor, - platform::CPUPlace(), - ctx.template device_context(), - &max_data); - ctx.template device_context().Wait(); - max_val = static_cast(max_data.data()[0]); - } - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("HardtanhGrad", - {*x, *dout}, - {*dx}, - {{"min_val", min_val}, {"max_val", max_val}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - clip, - ops::ClipNPUKernel, - ops::ClipNPUKernel); - -REGISTER_OP_NPU_KERNEL( - clip_grad, - ops::ClipGradNPUKernel, - ops::ClipGradNPUKernel); diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc deleted file mode 100644 index 491d44efa7261ea1362b69fe012fa939e4ad3f77..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concat_op_npu.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/concat_op.h" - -namespace paddle { -namespace operators { - -template -class ConcatNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - phi::DenseTensor* out = ctx.Output("Out"); - PADDLE_ENFORCE_NOT_NULL(ins[0], - platform::errors::NotFound( - "The first input tensor is not initialized.")); - auto axis = ctx.Attr("axis"); - - if (ctx.HasInput("AxisTensor")) { - PADDLE_THROW(platform::errors::NotFound( - "The AxisTensor is not supported on NPU yet.")); - } - axis = ComputeAxis(static_cast(axis), - static_cast(ins[0]->dims().size())); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - std::vector inputs; - std::vector names; - for (size_t i = 0; i < ins.size(); ++i) { - if (ins[i] && ins[i]->numel() > 0) { - inputs.push_back(*ins[i]); - names.push_back("x" + std::to_string(i)); - } else { - continue; - } - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*out}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } -}; - -template -class ConcatGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); - auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); - auto outs = ctx.MultiOutput(framework::GradVarName("X")); - - PADDLE_ENFORCE_NOT_NULL(ins[0], - platform::errors::NotFound( - "The first input tensor is not initialized.")); - - auto axis = ctx.Attr("axis"); - - axis = ComputeAxis(static_cast(axis), - static_cast(ins[0]->dims().size())); - - int offset = 0; - auto stream = - ctx.template device_context() - .stream(); - for (size_t j = 0; j < outs.size(); ++j) { - // For stop gradient: - // only process output tensors whose name is not kEmptyVarName - if (out_var_names[j] != framework::kEmptyVarName && - outs[j]->numel() != 0UL) { - outs[j]->mutable_data(ctx.GetPlace()); - std::vector offsets; - std::vector sizes; - for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { - if (dim == axis) { - offsets.push_back(offset); - sizes.push_back(ins[j]->dims()[dim]); - } else { - offsets.push_back(0); - sizes.push_back(ins[j]->dims()[dim]); - } - } - const auto& runner = - NpuOpRunner("SliceD", - {*out_grad}, - {*outs[j]}, - {{"offsets", offsets}, {"size", sizes}}); - runner.Run(stream); - } - if (ins[j]->numel() != 0UL) { - offset += ins[j]->dims()[axis]; - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(concat, - ops::ConcatNPUKernel, - ops::ConcatNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ConcatNPUKernel, -#endif - ops::ConcatNPUKernel); - -REGISTER_OP_NPU_KERNEL(concat_grad, - ops::ConcatGradNPUKernel, - ops::ConcatGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ConcatGradNPUKernel, -#endif - ops::ConcatGradNPUKernel); diff --git 
a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc deleted file mode 100644 index 44fb1aa5a17595a5abd9127bb7fa77fe242d030d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_op_npu.cc +++ /dev/null @@ -1,688 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/conv_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; -static void CastToFP16(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT16) - .Run(stream); -} - -static void CastToFP32(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT) - .Run(stream); -} - -template -class DepthwiseConvNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1], - platform::errors::InvalidArgument( - "ShapeError: The output channels must be equal to the " - "input channels. But received output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1], - input->dims()[1], - platform::errors::InvalidArgument( - "ShapeError: The output channels must be equal to the " - "input channels. But received output channel number is %d " - "and input channel number is %d", - output->dims()[1], - input->dims()[1])); - } - - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - auto stream = ctx.template device_context().stream(); - - // Transform filter (n, 1, h, w) --> (1, n, h, w) - phi::DenseTensor transformed_filter(filter->type()); - transformed_filter.mutable_data({filter->dims()[1], - filter->dims()[0], - filter->dims()[2], - filter->dims()[3]}, - ctx.device_context().GetPlace()); - std::vector perm = {1, 0, 2, 3}; - const auto& runner_trans = NpuOpRunner( - "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); - runner_trans.Run(stream); - - const auto& runner = NpuOpRunner("DepthwiseConv2D", - {input_tensor, transformed_filter}, - {output_tensor}, - {{"strides", strides}, - {"dilations", dilations}, - {"pads", padding}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class DepthwiseConvGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - auto output_grad = - ctx.Input(framework::GradVarName("Output")); - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - auto filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - auto stream = ctx.template device_context().stream(); - - // Transform filter (n, 1, h, w) --> (1, n, h, w) - phi::DenseTensor transformed_filter(filter->type()); - transformed_filter.mutable_data({filter->dims()[1], - 
filter->dims()[0], - filter->dims()[2], - filter->dims()[3]}, - ctx.device_context().GetPlace()); - std::vector perm = {1, 0, 2, 3}; - const auto& runner_trans = NpuOpRunner( - "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); - runner_trans.Run(stream); - - // construct NPU attr - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_EQ( - (dilations[2] == 1 && dilations[3] == 1), - true, - platform::errors::InvalidArgument( - "dilation_h and dilation_w in DepthwiseConv2DBackpropFilterD " - "must be equal to 1, but got dilation_h %d, dilation_w %d", - dilation[2], - dilation[3])); - - NpuOpRunner runner; - runner.SetType("DepthwiseConv2DBackpropFilterD") - .AddInput(input_tensor) - .AddInput(output_grad_tensor) - .AddOutput(*filter_grad) - .AddAttr("filter_size", phi::vectorize(transformed_filter.dims())) - .AddAttr("strides", strides) - .AddAttr("dilations", dilations) - .AddAttr("pads", padding) - .AddAttr("data_format", data_format) - .Run(stream); - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - NpuOpRunner runner; - runner.SetType("DepthwiseConv2DBackpropInputD") - .AddInput(transformed_filter) - .AddInput(output_grad_tensor) - .AddOutput(input_grad_tensor) - .AddAttr("input_size", phi::vectorize(input->dims())) - .AddAttr("strides", strides) - .AddAttr("dilations", dilations) - .AddAttr("pads", padding) - .AddAttr("data_format", data_format) - .Run(stream); - } - } -}; - -template -class NPUConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor 
input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv2D", - {input_tensor, *filter}, - {output_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class NPUConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = - ctx.Input(framework::GradVarName("Output")); - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - auto filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - std::vector filter_shape_vec = phi::vectorize(filter->dims()); - - phi::DenseTensor filter_grad_fp32(phi::DataType::FLOAT32); - filter_grad_fp32.Resize(filter_grad->dims()); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::FP16) { - CastToFP32(ctx, stream, *filter_grad, &filter_grad_fp32); - } else { - filter_grad_fp32.ShareDataWith(*filter_grad); - } - - const auto& runner = NpuOpRunner("Conv2DBackpropFilterD", - {input_tensor, output_grad_tensor}, - {filter_grad_fp32}, - {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", 
dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::FP16) { - CastToFP16(ctx, stream, filter_grad_fp32, filter_grad); - } - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - std::vector input_shape_vec = phi::vectorize(input->dims()); - - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - const auto& runner = NpuOpRunner("Conv2DBackpropInputD", - {*filter, output_grad_tensor}, - {input_grad_tensor}, - {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -template -class NPUConv3dKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - PADDLE_ENFORCE_EQ(data_format, - "NCDHW", - platform::errors::Unimplemented( - "the data_format must be NCDHW in " - "the npu kernel of conv3d, but got data_format " - "= [%s]", - data_format)); - - PADDLE_ENFORCE_EQ(groups, - 1, - platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); - - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto input_tensor = - ctx.AllocateTmpTensor(input->dims(), dev_ctx); - auto filter_tensor = - ctx.AllocateTmpTensor(filter->dims(), dev_ctx); - auto output_tensor = - ctx.AllocateTmpTensor(output->dims(), dev_ctx); - - input_tensor.ShareDataWith(*input); - filter_tensor.ShareDataWith(*filter); - output_tensor.ShareDataWith(*output); - - input_tensor.set_layout(DataLayout::kNCDHW); - filter_tensor.set_layout(DataLayout::kNCDHW); - output_tensor.set_layout(DataLayout::kNCDHW); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(5, 1); - std::vector dilations_vec(5, 1); - - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - strides_vec[4] = strides[2]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - dilations_vec[4] = dilations[2]; - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv3D", - {input_tensor, filter_tensor}, - {output_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class NPUConv3dGradKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - const phi::DenseTensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - phi::DenseTensor* input_grad = - ctx.Output(framework::GradVarName("Input")); - phi::DenseTensor* filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - PADDLE_ENFORCE_EQ(data_format, - "NCDHW", - platform::errors::Unimplemented( - "the data_format must be NCDHW in " - "the npu kernel of conv3d, but got data_format " - "= [%s]", - data_format)); - - PADDLE_ENFORCE_EQ(groups, - 1, - platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); - - auto& dev_ctx = ctx.template device_context(); - auto input_tensor = - ctx.AllocateTmpTensor(input->dims(), dev_ctx); - auto filter_tensor = - ctx.AllocateTmpTensor(filter->dims(), dev_ctx); - auto output_grad_tensor = ctx.AllocateTmpTensor( - output_grad->dims(), dev_ctx); - - input_tensor.ShareDataWith(*input); - filter_tensor.ShareDataWith(*filter); - output_grad_tensor.ShareDataWith(*output_grad); - - input_tensor.set_layout(DataLayout::kNCDHW); - filter_tensor.set_layout(DataLayout::kNCDHW); - output_grad_tensor.set_layout(DataLayout::kNCDHW); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(5, 1); - std::vector dilations_vec(5, 1); - - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - strides_vec[4] = strides[2]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - dilations_vec[4] = dilations[2]; - - auto stream = ctx.template device_context().stream(); - - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - std::vector filter_shape_vec = phi::vectorize(filter->dims()); - - phi::DenseTensor filter_grad_tensor = - ctx.AllocateTmpTensor(filter_grad->dims(), - dev_ctx); - filter_grad_tensor.ShareDataWith(*filter_grad); - filter_grad_tensor.set_layout(DataLayout::kNCDHW); - - const auto& runner = NpuOpRunner("Conv3DBackpropFilterD", - {input_tensor, output_grad_tensor}, - {filter_grad_tensor}, - {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - std::vector input_shape_vec = phi::vectorize(input->dims()); - - phi::DenseTensor input_grad_tensor = - ctx.AllocateTmpTensor(input_grad->dims(), - dev_ctx); - input_grad_tensor.ShareDataWith(*input_grad); - input_grad_tensor.set_layout(DataLayout::kNCDHW); - - const auto& runner = 
NpuOpRunner("Conv3DBackpropInputD", - {filter_tensor, output_grad_tensor}, - {input_grad_tensor}, - {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(depthwise_conv2d, - ops::DepthwiseConvNPUKernel, - ops::DepthwiseConvNPUKernel); - -REGISTER_OP_NPU_KERNEL(depthwise_conv2d_grad, - ops::DepthwiseConvGradNPUKernel, - ops::DepthwiseConvGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv2d, - ops::NPUConvOpKernel, - ops::NPUConvOpKernel); - -REGISTER_OP_NPU_KERNEL(conv2d_grad, - ops::NPUConvGradOpKernel, - ops::NPUConvGradOpKernel); - -REGISTER_OP_NPU_KERNEL(conv3d, - ops::NPUConv3dKernel, - ops::NPUConv3dKernel); - -REGISTER_OP_NPU_KERNEL(conv3d_grad, - ops::NPUConv3dGradKernel, - ops::NPUConv3dGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc deleted file mode 100644 index f9da50848df2af9f59f8f166f07a720f60a97307..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/cpu/conv_util.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class Conv2DTransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - std::vector output_padding = - ctx.Attr>("output_padding"); - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - // check dimension - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - // construct NPU attr - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - for (auto i = output_padding.size(); i < 4; ++i) { - output_padding.insert(output_padding.begin(), 0); - } - auto output_dim_vec = phi::vectorize(output_tensor.dims()); - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv2DTransposeD", - {input_tensor, *filter}, - {output_tensor}, - {{"input_size", output_dim_vec}, - {"strides", strides}, - {"dilations", dilations}, - {"output_padding", output_padding}, - {"groups", groups}, - {"pads", padding}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class Conv2DTransposeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - const phi::DenseTensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - phi::DenseTensor* input_grad = - ctx.Output(framework::GradVarName("Input")); - phi::DenseTensor* filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - if ((!input_grad) && (!filter_grad)) return; - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const int groups = ctx.Attr("groups"); - std::string 
padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const phi::DataLayout data_layout = phi::StringToDataLayout(data_format); - - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - // auto out_grad_dims = output_grad->dims(); - // const int batch_size = static_cast(input->dims()[0]); - - const bool channel_last = (data_layout == phi::DataLayout::kNHWC); - - framework::DDim in_data_dims; - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Conv2DBackpropFilterD", - {output_grad_tensor, input_tensor}, - {*filter_grad}, - {{"filter_size", phi::vectorize(filter_dims)}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - const auto& runner = NpuOpRunner("Conv2D", - {output_grad_tensor, *filter}, - {input_grad_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -template -class Conv3DTransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - std::vector output_padding = - ctx.Attr>("output_padding"); - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - std::string data_format = ctx.Attr("data_format"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - // check dimension - const bool channel_last = data_format == "NHWC"; - - if (data_format == "NHWC") { - data_format = "NDHWC"; - } else { - data_format = "NCDHW"; - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - 
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - // construct NPU attr - std::vector strides(5, 1); - std::vector dilations(5, 1); - - phi::DenseTensor input_tensor, output_tensor, filter_tensor; - input_tensor.Resize(input->dims()); - input_tensor.ShareDataWith(*input); - output_tensor.Resize(output->dims()); - output_tensor.ShareDataWith(*output); - filter_tensor.Resize(filter->dims()); - filter_tensor.ShareDataWith(*filter); - - PADDLE_ENFORCE_EQ( - dilation[0], - 1, - platform::errors::InvalidArgument( - "dilation[0] must be equal 1, but received %d.", dilation[0])); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNDHWC); - output_tensor.set_layout(DataLayout::kNDHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - strides[3] = stride[2]; - dilations[2] = dilation[1]; - dilations[3] = dilation[2]; - } else { - input_tensor.set_layout(DataLayout::kNCDHW); - output_tensor.set_layout(DataLayout::kNCDHW); - strides[2] = stride[0]; - strides[3] = stride[1]; - strides[4] = stride[2]; - dilations[3] = dilation[1]; - dilations[4] = dilation[2]; - } - filter_tensor.set_layout(DataLayout::kNCDHW); - - auto output_dim_vec = phi::vectorize(output_tensor.dims()); - - auto& dev_ctx = ctx.template device_context(); - - NpuOpRunner runner; - runner.SetType("Conv3DBackpropInputD") - .AddInput(filter_tensor) - .AddInput(input_tensor) - .AddAttr("input_size", output_dim_vec) - .AddAttr("strides", strides) - .AddAttr("pads", padding) - .AddAttr("dilations", dilations) - .AddAttr("groups", groups) - .AddAttr("data_format", data_format) - .AddOutput(output_tensor); - runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(conv2d_transpose, - ops::Conv2DTransposeNPUKernel, - ops::Conv2DTransposeNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv2d_transpose_grad, - ops::Conv2DTransposeGradNPUKernel, - ops::Conv2DTransposeGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv3d_transpose, - ops::Conv3DTransposeNPUKernel, - ops::Conv3DTransposeNPUKernel); diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc deleted file mode 100644 index 5aaa832ce3383bddd6ed8c819174db62e4289200..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/crop_op_npu.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
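A note on the transpose kernels above: Conv2DTransposeD and Conv3DBackpropInputD both receive "input_size" set to the output dims because a transposed convolution is exactly the input-gradient of a forward convolution, so the op needs the shape it is reconstructing. The corresponding per-dimension arithmetic, assuming the standard convention (not quoted from the file):

// Output extent of one spatial dimension of a transposed convolution.
int conv_transpose_out_dim(int in, int k, int stride, int pad, int dilation,
                           int output_padding) {
  return (in - 1) * stride - 2 * pad + dilation * (k - 1) + 1 + output_padding;
}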
*/ - -#include "paddle/fluid/operators/crop_op.h" - -namespace paddle { -namespace operators { - -template -class CropNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - std::vector offset_list; - if (ctx.HasInput("Offsets")) { - auto* offsets_tensor = ctx.Input("Offsets"); - paddle::framework::TensorToVector( - *offsets_tensor, ctx.device_context(), &offset_list); - if (offset_list.empty()) { - offset_list.resize(x->dims().size(), 0); - } - } else { - auto res = ctx.Attr>("offsets"); - if (res.empty()) { - offset_list.resize(x->dims().size(), 0); - } else { - offset_list.insert(offset_list.end(), res.begin(), res.end()); - } - } - - PADDLE_ENFORCE_EQ( - static_cast(offset_list.size()), - x->dims().size(), - platform::errors::InvalidArgument( - "The shape (%d) of CropOp's " - "'offset' attribute should be equal to the shape of dims " - "(%d) of the Input(X).", - offset_list.size(), - x->dims().size())); - - int axis_int = 0; - framework::NPUAttributeMap attr_input = {{"offsets", offset_list}, - {"axis", axis_int}}; - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - if (ctx.HasInput("Y")) { - auto* shape = ctx.Input("Y"); - PADDLE_ENFORCE_EQ(shape->dims().size(), - x->dims().size(), - platform::errors::InvalidArgument( - "The shape of dims of (%d) of CropOp's " - "Input(shape) should be equal to the shape of dims " - "(%d) of the Input(X).", - shape->dims().size(), - x->dims().size())); - - // shape memory maybe have gc. - phi::DenseTensor tmp_shape(*shape); - tmp_shape.mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } else { - auto shape_size = ctx.Attr>("shape"); - PADDLE_ENFORCE_EQ(shape_size.size(), - x->dims().size(), - platform::errors::InvalidArgument( - "The shape of dims of (%d) of CropOp's " - "Input(shape) should be equal to the shape of dims " - "(%d) of the Input(X).", - shape_size.size(), - x->dims().size())); - phi::DenseTensor tmp_shape(x->dtype()); - tmp_shape.Resize(phi::make_ddim(shape_size)); - tmp_shape.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - crop, - ops::CropNPUKernel, - ops::CropNPUKernel, - ops::CropNPUKernel); diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc deleted file mode 100644 index a5c77922054da5634b469b6c9ef777c6742c6d5d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -static void CumsumImp(const phi::DenseTensor& input, - phi::DenseTensor* output, - const framework::NPUAttributeMap& attr_input, - const framework::ExecutionContext& ctx) { - auto stream = - ctx.template device_context() - .stream(); - if (framework::TransToProtoVarType(input.dtype()) == - framework::proto::VarType::INT64) { - phi::DenseTensor tmp_input; - tmp_input.mutable_data(input.dims(), ctx.GetPlace()); - auto dst_acl_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type())); - const auto& cast_runner_1 = - NpuOpRunner("Cast", - {input}, - {tmp_input}, - {{"dst_type", static_cast(dst_acl_dtype)}}); - cast_runner_1.Run(stream); - - phi::DenseTensor tmp_output; - tmp_output.mutable_data(output->dims(), ctx.GetPlace()); - const auto& runner = - NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); - runner.Run(stream); - - dst_acl_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(output->type())); - const auto& cast_runner_2 = - NpuOpRunner("Cast", - {tmp_output}, - {*output}, - {{"dst_type", static_cast(dst_acl_dtype)}}); - cast_runner_2.Run(stream); - } else { - const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input); - runner.Run(stream); - } -} - -template -class CumSumNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - bool exclusive = ctx.Attr("exclusive"); - bool reverse = ctx.Attr("reverse"); - - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = { - {"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}}; - - bool flatten = ctx.Attr("flatten"); - if (flatten) { - PADDLE_ENFORCE_EQ( - axis, - -1, - platform::errors::InvalidArgument( - "when flatten is true, attr axis must be default %d, but got %d", - -1, - axis)); - - phi::DenseTensor new_x(x->type()); - new_x.ShareDataWith(*x); - - new_x.Resize(phi::make_ddim({x->numel()})); - - CumsumImp(new_x, out, attr_input, ctx); - } else { - CumsumImp(*x, out, attr_input, ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - cumsum, - ops::CumSumNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::CumSumNPUKernel, -#endif - ops::CumSumNPUKernel, - ops::CumSumNPUKernel); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc deleted file mode 100644 index 9c84961f611c0e68e9bb3eb18700492559327c28..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ /dev/null @@ -1,212 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class DropoutNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* seed_tensor = - ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; - auto* out = ctx.Output("Out"); - auto* mask = ctx.Output("Mask"); - - auto dropout_prob = ctx.Attr("dropout_prob"); - auto is_test = ctx.Attr("is_test"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - if (dropout_prob == 1.) { - const auto& runner_zeros_out = NpuOpRunner("ZerosLike", {*out}, {*out}); - runner_zeros_out.Run(stream); - mask->mutable_data(ctx.GetPlace()); - const auto& runner_zeros_mask = - NpuOpRunner("ZerosLike", {*mask}, {*mask}); - runner_zeros_mask.Run(stream); - return; - } - - // only the default `upscale_in_train` method is implemented - if (!is_test) { - phi::DenseTensor tmp_x(x->dtype()); - phi::DenseTensor tmp_out(out->dtype()); - tmp_x.ShareDataWith(*x); - tmp_out.ShareDataWith(*out); - if (x->dims().size() == 1) { - // DropOutDoMask produces wrong results when the input - // is 1-D, so reshape it to 2-D. - std::vector vec_dim = phi::vectorize(x->dims()); - tmp_x.Resize(phi::make_ddim({vec_dim[0], 1})); - tmp_out.Resize(phi::make_ddim({vec_dim[0], 1})); - } - - int seed = 0; - int seed2 = 0; - float keep_prob = 1. - dropout_prob; - if (seed_tensor) { - std::vector seed_data; - paddle::framework::TensorToVector( - *seed_tensor, ctx.device_context(), &seed_data); - seed = seed_data[0]; - } else { - seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; - } - - phi::DenseTensor keep_prob_tensor(x->dtype()); - keep_prob_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&keep_prob_tensor, - static_cast(keep_prob)); - - mask->mutable_data(ctx.GetPlace()); - - // The mask produced by the `DropOutGenMask` NPU OP has a different - // layout from the output `Mask`. - phi::DenseTensor npu_mask(phi::DataType::UINT8); - uint32_t length = (x->numel() + 128 - 1) / 128 * 128; - npu_mask.Resize(phi::make_ddim({length / 8})); - npu_mask.mutable_data(ctx.GetPlace()); - - // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU - // OP must be a scalar with shape[0]. At present, the shape - // of the `prob` phi::DenseTensor of this OP is forced to be set to 0 - // in `npu_op_runner.cc`, which needs to be optimized later. 
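// An inference from the arithmetic above, not a comment from the original file:
// DropOutGenMask appears to emit one mask bit per element, padded up to a
// multiple of 128 bits, so `length` is the rounded-up bit count and the uint8
// buffer holds length / 8 bytes -- e.g. numel = 1000 -> 1024 bits -> 128 bytes.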
- NpuOpRunner runner_gen_mask; - runner_gen_mask.SetType("DropOutGenMask") - .AddInput(phi::vectorize(tmp_out.dims())) - .AddInput(keep_prob_tensor) - .AddOutput(npu_mask) - .AddAttr("seed", seed) - .AddAttr("seed2", seed2); - runner_gen_mask.Run(stream); - - NpuOpRunner runner_dropout; - runner_dropout.SetType("DropOutDoMask") - .AddInput(tmp_x) - .AddInput(npu_mask) - .AddInput(keep_prob_tensor) - .AddOutput(tmp_out); - runner_dropout.Run(stream); - - // cast `out` from float/float16 to bool - phi::DenseTensor cast_mask(phi::DataType::BOOL); - cast_mask.Resize(mask->dims()); - cast_mask.mutable_data(ctx.GetPlace()); - auto dst_dtype_bool = - ConvertToNpuDtype(framework::TransToProtoVarType(cast_mask.dtype())); - const auto& runner_cast_mask_bool = - NpuOpRunner("Cast", - {*out}, - {cast_mask}, - {{"dst_type", static_cast(dst_dtype_bool)}}); - runner_cast_mask_bool.Run(stream); - - // cast cast_mask from bool to uint8 - auto dst_dtype_uint8 = - ConvertToNpuDtype(framework::TransToProtoVarType(mask->dtype())); - const auto& runner_cast_mask_uint8 = - NpuOpRunner("Cast", - {cast_mask}, - {*mask}, - {{"dst_type", static_cast(dst_dtype_uint8)}}); - runner_cast_mask_uint8.Run(stream); - } else { - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - } - } -}; - -template -class DropoutGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* mask = ctx.Input("Mask"); - - auto dropout_prob = ctx.Attr("dropout_prob"); - auto is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ(is_test, - false, - platform::errors::PreconditionNotMet( - "GradOp is only callable when is_test is false")); - - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - if (dropout_prob == 1.) { - const auto& runner_zeros = NpuOpRunner("ZerosLike", {*dx}, {*dx}); - runner_zeros.Run(stream); - return; - } - - // cast mask from uint8 to float32/float16 - phi::DenseTensor cast_mask(dx->dtype()); - cast_mask.Resize(mask->dims()); - cast_mask.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dx->dtype())); - const auto& runner_cast_mask = - NpuOpRunner("Cast", - {*mask}, - {cast_mask}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mask.Run(stream); - - const auto& runner = - NpuOpRunner("MaskedScale", - {*dout, cast_mask}, - {*dx}, - {{"value", static_cast(1. / (1 - dropout_prob))}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - dropout, - ops::DropoutNPUKernel, - ops::DropoutNPUKernel); - -REGISTER_OP_NPU_KERNEL( - dropout_grad, - ops::DropoutGradNPUKernel, - ops::DropoutGradNPUKernel); diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc deleted file mode 100644 index 77f12f17ce258689f5ba1d6fe611e5578af455db..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/expand_as_v2_op_npu.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
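With `upscale_in_train`, surviving activations are scaled by 1/keep_prob during training so that inference is a plain copy; the MaskedScale call above applies the same 1/(1 - dropout_prob) factor to the gradient. A reference CPU sketch of the backward pass (illustrative names; mask entries assumed 0/1, as produced by the casts above):

#include <cstddef>

void dropout_grad(const float* dout, const unsigned char* mask, float* dx,
                  std::size_t n, float dropout_prob) {
  const float scale = 1.f / (1.f - dropout_prob);  // MaskedScale "value" attr
  for (std::size_t i = 0; i < n; ++i)
    dx[i] = dout[i] * (mask[i] ? scale : 0.f);
}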
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/expand_as_v2_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandAsV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, - rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, - rank)); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument("The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, - MAX_RANK_SUPPORTED)); - ExpandAs(context); - } - - protected: - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], - 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in " - "target tensor for expand_as_v2 op.", - vec_in_dims[i], - target_shape[i])); - } - } - auto* out0 = context.Output("Out"); - - framework::DDim out_dims = phi::make_ddim(target_shape); - - out0->Resize(out_dims); - out0->mutable_data(context.GetPlace()); - - const auto& runner = - NpuOpRunner("ExpandD", {*in0}, {*out0}, {{"shape", target_shape}}); - - auto stream = - context.template device_context() - .stream(); - - runner.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc deleted file mode 100644 index d7e553b83bb67bd9d86d948c1b91ff2219799457..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/expand_op_npu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for Op(expand) " - "must be greater than or equal to 1, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for Op(expand) " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - switch (rank) { - case 1: - Expand<1>(context); - break; - case 2: - Expand<2>(context); - break; - case 3: - Expand<3>(context); - break; - case 4: - Expand<4>(context); - break; - case 5: - Expand<5>(context); - break; - case 6: - Expand<6>(context); - break; - } - } - - protected: - template - void Expand(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto expand_times = get_expand_times(context); - PADDLE_ENFORCE_EQ(static_cast(in_dims.size()), - expand_times.size(), - platform::errors::InvalidArgument( - "The number of elements (%d) of 'expand_times' for " - "Op(expand) must be equal to the number " - "of dimensions (%d) of the input.", - expand_times.size(), - static_cast(in_dims.size()))); - auto* out0 = context.Output("Out"); - framework::DDim out_dims(in_dims); - - for (size_t i = 0; i < expand_times.size(); ++i) { - out_dims[i] *= expand_times[i]; - } - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - out0->Resize(out_dims); - out0->mutable_data(place); - - bool is_expand_times_all_one = - (out0->numel() == in0->numel()) ? true : false; - - if (is_expand_times_all_one) { - memory::Copy(place, - out0->mutable_data(place), - place, - in0->data(), - in0->numel() * sizeof(T), - stream); - if (out_dims != in_dims) { - out0->Resize(out_dims); - } - } else { - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand, - ops::ExpandNPUKernel, - ops::ExpandNPUKernel, - ops::ExpandNPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc deleted file mode 100644 index e9d12beaa78dea7d64f7ad4ca5cfa19f3a6f4bd5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(expand); -USE_OP_DEVICE_KERNEL(expand, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto in = scope->Var("X"); - auto expand_times = scope->Var("ExpandTimes"); - auto out = scope->Var("Out"); - auto in_t = in->GetMutable(); - auto out_t = out->GetMutable(); - auto expand_times_t = expand_times->GetMutable(); - - auto place = ctx.GetPlace(); - paddle::framework::TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); - paddle::framework::TensorFromVector( - std::vector({1, 10, 1}), ctx, expand_times_t); - - in_t->Resize(phi::make_ddim({3, 1, 7})); - expand_times_t->Resize(phi::make_ddim({3})); - out_t->Resize(phi::make_ddim({3, 10, 7})); - out_t->mutable_data(place); - - f::AttributeMap attrs = {{}}; - auto op = - f::OpRegistry::CreateOp("expand", - {{"X", {"X"}}, {"ExpandTimes", {"ExpandTimes"}}}, - {{"Out", {"Out"}}}, - attrs); - op->Run(*scope, place); - ctx.Wait(); - - auto out_dim = out_t->dims(); - EXPECT_EQ(out_dim.at(0), 3); - EXPECT_EQ(out_dim.at(1), 10); - EXPECT_EQ(out_dim.at(2), 7); -} - -TEST(expand, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc deleted file mode 100644 index 7f37fc67d529dec9585969e4238d1ecd61e46659..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ /dev/null @@ -1,235 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_v2_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); - - auto in_dims = X->dims(); - auto expand_shape = get_expand_shape(ctx); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = expand_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector final_expand_shape(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size cannot be zero.")); - if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] --> - // final_expand_shape = [3,4,10,2] - PADDLE_ENFORCE_GT( - expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_v2 op.", - expand_shape[i])); - final_expand_shape[i] = expand_shape[i]; - } else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X = - // [10,1] --> final_expand_shape = - // [3,4,10,4] - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - expand_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_v2 op.", - vec_in_dims[i], - expand_shape[i])); - final_expand_shape[i] = expand_shape[i]; - } else { - final_expand_shape[i] = expand_shape[i]; - } - } else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape - // = [3,4,10,2] - PADDLE_ENFORCE_EQ( - expand_shape[i], - -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_v2 op, " - "only -1 is supported, but the value received is %d.", - expand_shape[i])); - final_expand_shape[i] = vec_in_dims[i]; - } - } - - framework::NPUAttributeMap attr_input = {{"shape", final_expand_shape}}; - - auto rank = X->dims().size(); - - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'X' for expand_v2_npu op must be positive, " - "but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'X' for expand_v2_npu op must be less than " - "or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - auto shape_size = final_expand_shape.size(); - PADDLE_ENFORCE_GE( - shape_size, - rank, - platform::errors::InvalidArgument( - "The number (%d) of elements of 'shape' for expand_v2_npu op must " - "be " - "greater than or equal to the rank (%d) of the input 'X'.", - shape_size, - rank)); - PADDLE_ENFORCE_LE(shape_size, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number (%d) of elements of 'shape' for " - "expand_v2_npu op must be " - "less than or equal to %d.", - shape_size, - MAX_RANK_SUPPORTED)); - - framework::DDim out_dims = phi::make_ddim(final_expand_shape); - Out->Resize(out_dims); - Out->mutable_data(ctx.GetPlace()); - - const auto& dev_ctx = - ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - 
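    // (Editorial aside, hedged: this `op_func` lambda is the callback handed
    // to NpuOpRunner::TypeAdapter below. Judging from the dispatch that
    // follows, bool inputs appear to be routed through uint8 and int64
    // inputs through int32 before ExpandD is re-run, presumably because the
    // Ascend ExpandD kernel lacks native support for those dtypes.)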
}; - - if (framework::TransToProtoVarType(X->dtype()) == - framework::proto::VarType::BOOL) { - NpuOpRunner::TypeAdapter({*X}, - {*Out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } else if (framework::TransToProtoVarType(X->dtype()) == - framework::proto::VarType::INT64) { - NpuOpRunner::TypeAdapter({*X}, - {*Out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class ExpandV2NPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - // case 1: reduce dout dims to dx dims - // For example: [2, 120] --> [120] - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - - phi::DenseTensor tmp_dout(dout->dtype()); - phi::DenseTensor reduced_dout(dx->dtype()); - tmp_dout.ShareDataWith(*dout); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - tmp_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReduceSumD", - {*dout}, - {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = reduced_dout; - } - - // case 2: reduce axis of dout in which dim is 1 - // For example: [12, 140] --> [1, 140] - - // case 3: copy dout to dx when shape is totally same, and dim in dx != 1 - // For example: [2, 10, 5] --> [2, 10, 5] - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { - axes.push_back(i); - } - } - if (axes.size() != 0) { - const auto& runner = NpuOpRunner("ReduceSumD", - {tmp_dout}, - {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - } else { - framework::TensorCopySync(tmp_dout, ctx.GetPlace(), dx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand_v2, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - expand_v2_grad, - ops::ExpandV2NPUGradKernel, - ops::ExpandV2NPUGradKernel, - ops::ExpandV2NPUGradKernel); diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc deleted file mode 100644 index ee71ebee9b0665ff1bd2c9fd9758938f0e90bd49..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eye_op_npu.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class EyeNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto num_rows = ctx.Attr<int64_t>("num_rows");
-
-    auto d_nums = ctx.Attr<int>("dtype");
-    auto dtype =
-        ConvertToNpuDtype(static_cast<framework::proto::VarType::Type>(d_nums));
-
-    auto num_columns = ctx.Attr<int64_t>("num_columns");
-    if (num_columns == -1) num_columns = num_rows;
-
-    framework::NPUAttributeMap attr_input = {
-        {"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}};
-
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input);
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    eye,
-    ops::EyeNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::EyeNPUKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::EyeNPUKernel<paddle::platform::NPUDeviceContext,
-                      paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc
deleted file mode 100644
index 62d3e5a82f5a3238343f8c7ea41c7fff54b63462..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_any_like_op_npu.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class FillAnyLikeNPUKernel : public framework::OpKernel { - public: - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, - T>::type>::type; - - void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - float value = context.Attr("value"); - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - platform::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), - value)); - - PADDLE_ENFORCE_EQ( - std::isnan(value), - false, - platform::errors::InvalidArgument("The filled value is NaN.")); - - Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); - tensor_tmp.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); - - auto stream = - context.template device_context() - .stream(); - - auto shape = out->dims(); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_tmp) - .AddOutput(*out) - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(fill_any_like, - ops::FillAnyLikeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::FillAnyLikeNPUKernel, -#endif - ops::FillAnyLikeNPUKernel, - ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc deleted file mode 100644 index fed75fc018a0c49d8cc3de168e214b673d0438f6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto float_value = ctx.Attr("value"); - auto str_value = ctx.Attr("str_value"); - auto force_cpu = ctx.Attr("force_cpu"); - - auto *out = ctx.Output("Out"); - auto *in = ctx.Input("Input"); - if (in->lod().size() && ctx.Attr("input_dim_idx") == 0) { - // set the correct batch size for the phi::DenseTensor. - auto odims = out->dims(); - int output_dim_idx = ctx.Attr("output_dim_idx"); - odims[output_dim_idx] = static_cast(in->lod().back().size()) - 1; - out->mutable_data(odims, ctx.GetPlace()); - } - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); - if (cpu_place) { - auto &dev_ctx = *pool.Get(platform::CPUPlace()); - phi::funcs::SetConstant functor; - out->mutable_data(platform::CPUPlace(), - framework::TransToPhiDataType(data_type)); - functor(reinterpret_cast(dev_ctx), - out, - static_cast(value)); - } else { - out->mutable_data(ctx.GetPlace(), - framework::TransToPhiDataType(data_type)); - phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type)); - tensor_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_tmp, value); - - auto stream = - ctx.template device_context() - .stream(); - const auto &runner = NpuOpRunner("FillD", - {tensor_tmp}, - {*out}, - {{"dims", phi::vectorize(out->dims())}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - float>, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - int>, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - paddle::platform::float16>); diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc deleted file mode 100644 index 0724caf32793e0317557320cea7a86f98ab82e05..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" - -namespace paddle { -namespace operators { - -template -class FillConstantNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto str_value = ctx.Attr("str_value"); - auto float_value = ctx.Attr("value"); - - auto *out_var = ctx.Output("Out"); - auto stream = - ctx.template device_context() - .stream(); - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - auto shape = GetShape(ctx); - - out_var->mutable_data(shape, ctx.GetPlace()); - if (data_type != framework::proto::VarType::BOOL) { - Tensor tensor_value(framework::TransToPhiDataType(data_type)); - tensor_value.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, value); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(*out_var) - .Run(stream); - } else { - const auto &dev_ctx = - ctx.template device_context(); - auto op_func = [&shape, &value]( - const std::vector &inputs, - const std::vector &outputs, - const NPUAttributeMap &attrs, - const platform::NPUDeviceContext &dev_ctx) { - Tensor tensor_value; - tensor_value.mutable_data({1}, dev_ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, - static_cast(value)); - - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(outputs[0]) - .Run(dev_ctx.stream()); - }; - NpuOpRunner::TypeAdapter({}, - {*out_var}, - {}, - dev_ctx, - op_func, - {}, - {framework::proto::VarType::UINT8}); - } - } -}; -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - fill_constant, - paddle::operators::FillConstantNPUKernel, - paddle::operators::FillConstantNPUKernel, - paddle::operators::FillConstantNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::FillConstantNPUKernel, -#endif - paddle::operators::FillConstantNPUKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op_npu.cc b/paddle/fluid/operators/fill_zeros_like_op_npu.cc deleted file mode 100644 index 6cedc658f76f5dc2876ccff2fd5397b10d9d5b70..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/fill_zeros_like_op_npu.cc +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_zeros_like_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FillZerosLikeNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<phi::DenseTensor>("X");
-    auto* out = context.Output<phi::DenseTensor>("Out");
-
-    out->mutable_data<T>(context.GetPlace());
-    auto stream =
-        context.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*out});
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    fill_zeros_like,
-    ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
-    ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, double>,
-    ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext,
-                                paddle::platform::float16>,
-    ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, bool>);
diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc
deleted file mode 100644
index 2e43c33efd575bc65e0679fdbb0b5ef9723a11df..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/flatten_op_npu.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/flatten_op.h" - -namespace paddle { -namespace operators { - -template -class Flatten2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto &axis = context.Attr("axis"); - out->mutable_data(context.GetPlace(), in->type()); - framework::NPUAttributeMap attr_input = {{"axis", axis}}; - - auto stream = - context.template device_context() - .stream(); - const auto &runner = NpuOpRunner("FlattenV2", {*in}, {*out}, attr_input); - runner.Run(stream); - } -}; - -template -class Flatten2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(x_dims); - } -}; - -template -class FlattenContiguousRangeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); - auto *Out = ctx.Output("Out"); - int start_axis = ctx.Attr("start_axis"); - int stop_axis = ctx.Attr("stop_axis"); - - Out->mutable_data(ctx.GetPlace()); - - const auto &runner = - NpuOpRunner("FlattenV2", - {*X}, - {*Out}, - {{"axis", static_cast(start_axis)}, - {"end_axis", static_cast(stop_axis)}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class FlattenContiguousRangeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(x_dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(flatten2, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel); -REGISTER_OP_NPU_KERNEL(flatten2_grad, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - flatten_contiguous_range, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel); -REGISTER_OP_NPU_KERNEL( - flatten_contiguous_range_grad, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel); diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc 
b/paddle/fluid/operators/gather_nd_op_npu.cc deleted file mode 100644 index feb1567e58d78d5d3d1249422d81f5802752f4c9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class GatherNdNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); - - out->template mutable_data(ctx.GetPlace()); - - if (x->numel() == 0) return; - - if (index->numel() == 0) { - framework::TensorCopy(*x, ctx.GetPlace(), ctx.device_context(), out); - return; - } - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, - true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {}); - auto stream = ctx.template device_context().stream(); - runner.Run(stream); - } -}; - -template -class GatherNdGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *p = dx->mutable_data(ctx.GetPlace()); - - if (dx->numel() == 0) return; - - if (index->numel() == 0) { - framework::TensorCopy(*dout, ctx.GetPlace(), ctx.device_context(), dx); - return; - } - - phi::DenseTensor tmp_tensor(index->type()); - phi::DenseTensor tmp_tensor2(dout->type()); - const auto index_dims = index->dims(); - if (index_dims.size() == 1) { - tmp_tensor.ShareDataWith(*index); - std::vector new_dim = {1, index_dims[0]}; - tmp_tensor.Resize(phi::make_ddim(new_dim)); - index = &tmp_tensor; - - tmp_tensor2.ShareDataWith(*dout); - std::vector new_dim2{1}; - for (int i = index->numel(); i < x->dims().size(); i++) { - new_dim2.push_back(x->dims()[i]); - } - tmp_tensor2.Resize(phi::make_ddim(new_dim2)); - dout = &tmp_tensor2; - } - - auto stream = ctx.template device_context().stream(); - platform::NPUMemsetAsync( - static_cast(p), 0, dx->numel() * sizeof(T), stream); - - const 
auto &runner_scatter = NpuOpRunner( - "ScatterNdAdd", {*dx, *index, *dout}, {*dx}, {{"use_locking", false}}); - runner_scatter.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(gather_nd, - ops::GatherNdNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::GatherNdNPUKernel, -#endif - ops::GatherNdNPUKernel); - -REGISTER_OP_NPU_KERNEL(gather_nd_grad, - ops::GatherNdGradNPUKernel, - ops::GatherNdGradNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc deleted file mode 100644 index ab42d78a0a1d74a2f7cf6f4d207ab61dd477b2ca..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_op_npu.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class GatherOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - const auto &runner = NpuOpRunner( - "Gather", {*x, *index}, {*out}, {{"validate_indices", true}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class GatherGradOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - // step1: Unsqueeze index - phi::DenseTensor tmp_tensor(index->type()); - const auto index_dims = index->dims(); - if (index_dims.size() == 1) { - tmp_tensor.ShareDataWith(*index); - std::vector new_dim = {index_dims[0], 1}; - tmp_tensor.Resize(phi::make_ddim(new_dim)); - index = &tmp_tensor; - } - - auto stream = - ctx.template device_context() - .stream(); - - // step2: ZerosLike x in device - Tensor zeroslike_xout(dx->type()); - zeroslike_xout.Resize(x->dims()); - auto p = zeroslike_xout.mutable_data(ctx.GetPlace()); - - platform::NPUMemsetAsync( - static_cast(p), 0, zeroslike_xout.numel() * sizeof(T), stream); - - // step3: scatter(x_grad) - const auto &runner_scatter = NpuOpRunner( - "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); - runner_scatter.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - gather, - ops::GatherOpNPUKernel, - ops::GatherOpNPUKernel, - ops::GatherOpNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gather_grad, - ops::GatherGradOpNPUKernel, - ops::GatherGradOpNPUKernel, - ops::GatherGradOpNPUKernel); diff 
--git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc deleted file mode 100644 index 69d82ecaedeea45cea5cd7c66f9f2e6acb6073d0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(gather); -USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP_ITSELF(gather_grad); -USE_OP_DEVICE_KERNEL(gather_grad, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto index = scope->Var("Index"); - auto tensor_index = index->GetMutable(); - - std::vector init_x; - for (int64_t i = 1; i < 7; ++i) { - // 1,2,3,4,5,6 - init_x.push_back(static_cast(i)); - } - - // [[1, 2],[3, 4],[5, 6]] - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({3, 2})); - - std::vector init_index = {1, 2}; - paddle::framework::TensorFromVector(init_index, ctx, tensor_index); - tensor_index->Resize(phi::make_ddim({2})); - - ctx.Wait(); - - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - f::AttributeMap attrs = {{"validate_indices", true}}; - auto op = f::OpRegistry::CreateOp( - op_type, {{"X", {"X"}}, {"Index", {"Index"}}}, {{"Out", {"Out"}}}, attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - // ref:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/tensor/manipulation/gather_cn.html#gather - for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; - } - uint32_t expected_size = 4; - EXPECT_EQ((uint32_t)out_vec.size(), expected_size); - - // {3, 4, 5, 6} - std::vector expected_out_vec; - for (int64_t i = 3; i < 7; ++i) { - expected_out_vec.push_back(static_cast(i)); - } - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], expected_out_vec[i]); - } -} - -template -void CompareGrad(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto index = scope->Var("Index"); - auto tensor_index = index->GetMutable(); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - std::vector init_index = {0, 1}; - paddle::framework::TensorFromVector(init_index, ctx, tensor_index); - 
tensor_index->Resize(phi::make_ddim({2})); - - std::vector init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({3, 2})); - - std::vector init_dout = {5.0, 10.0, 2.0, 3.0}; - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize(phi::make_ddim({2, 2})); - - ctx.Wait(); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - op_type, - {{"X", {"X"}}, {"Index", {"Index"}}, {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - ctx.Wait(); - - uint32_t expected_size = 3 * 2; - EXPECT_EQ((uint32_t)dx_vec.size(), expected_size); - - std::vector expected_dx_vec = {5.0, 10.0, 2.0, 3.0, 0.0, 0.0}; - for (uint32_t i = 0; i < dx_vec.size(); i++) { - VLOG(3) << "dx_vec[i]=" << dx_vec[i]; - EXPECT_EQ(dx_vec[i], expected_dx_vec[i]); - } -} - -TEST(gather, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "gather"); -} - -TEST(gather, NPU_fp16) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "gather"); -} - -TEST(gather_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx, "gather_grad"); -} diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc deleted file mode 100644 index 9b3c23ad2b9c19dab0ca6465b549f76e1b5d3907..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gaussian_random_op_npu.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -template -class NPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - tensor->mutable_data(context.GetPlace()); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::normal_distribution dist(mean, std); - - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - cpu_data[i] = dist(*engine); - } - framework::TensorCopy( - cpu_tensor, - context.GetPlace(), - context.template device_context(), - tensor); - context.template device_context() - .Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(gaussian_random, ops::NPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc deleted file mode 100644 index 1b40a6fbb454c19487e25b18a8dd39478c06d688..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template -class GeluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Gelu", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class GeluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(pangyoki): In the original implementation of GeluGrad op, the input - // is {*dout, *x, out}, where out = Gelu(x). However, we find that variable - // `out` was not actually used. In order to improve performance, the - // useless GELU operation was deleted. 
- // We directly use `*dout` as a placeholder to replace `out`, it will not - // be used in calculations. - const auto& runner_dx = - NpuOpRunner("GeluGrad", {*dout, *x, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - gelu, - ops::GeluNPUKernel, - ops::GeluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gelu_grad, - ops::GeluGradNPUKernel, - ops::GeluGradNPUKernel); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc deleted file mode 100644 index 9dca0bb8cba0f569a2834c79300ba55b3cab0c22..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(gelu); -USE_OP_DEVICE_KERNEL(gelu, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - f::AttributeMap attrs; - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - - auto op = f::OpRegistry::CreateOp( - "gelu", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - op->Run(*scope, place); - - ctx.Wait(); - - // eval time - struct timeval start, end; - gettimeofday(&start, NULL); - - for (int i = 0; i < 100; i++) { - op->Run(*scope, place); - } - - ctx.Wait(); - - gettimeofday(&end, NULL); - int micros = - (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); - printf("used time: %d\n", micros / 100); - - // eval value - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - float expected = 0.841192; - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_FLOAT_EQ(out_vec[i], static_cast(expected)); - } -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init_dout; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_dout.push_back(static_cast(1.0)); - } - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - 
paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize({10, 10}); - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - f::AttributeMap attrs; - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - - auto op = f::OpRegistry::CreateOp("gelu_grad", - {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - op->Run(*scope, place); - - ctx.Wait(); - - // eval time - struct timeval start, end; - gettimeofday(&start, NULL); - - for (int i = 0; i < 100; i++) { - op->Run(*scope, place); - } - - ctx.Wait(); - - gettimeofday(&end, NULL); - int micros = - (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); - printf("used time: %d\n", micros / 100); - - // eval value - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - float expected = 1.082964; - for (uint32_t i = 0; i < dx_vec.size(); i++) { - EXPECT_FLOAT_EQ(dx_vec[i], static_cast(expected)); - } -} - -TEST(gelu, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(gelu_grad, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc deleted file mode 100644 index 49fdd3566825bc464f3a85649d175ecfb8b3faf3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -struct GroupNormFunction { - public: - explicit GroupNormFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - void ReduceMean(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - // y should be init first - const auto& runner = NpuOpRunner( - "ReduceMeanD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void ReduceSum(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - // y should be init first - const auto& runner = NpuOpRunner( - "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Div(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void DivNoNan(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - // y should be init first - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Sqrt(const phi::DenseTensor* x, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - phi::DenseTensor ReduceMeanToNG(const phi::DenseTensor* x, - const DataLayout& data_layout, - const int64_t N, - const int64_t C, - const int64_t H, - const int64_t W, - const int G) { - phi::DenseTensor y(x->type()); - // y.mutable_data( {N,G,1}, place ); - if (data_layout == DataLayout::kNCHW) { - y.mutable_data({N, G, 1}, place); - // shape of x is [N, G, C*H*W/G] - this->ReduceMean(x, &y, std::vector{2}); - } else { - y.mutable_data({N, 1, G}, place); - // shape of x is [N, C*H*W/G, G] - phi::DenseTensor x_trans(x->type()); - x_trans.mutable_data({N, G, C * H * W / G}, place); - this->Transpose(x, &x_trans, std::vector{0, 2, 1}); - this->ReduceMean(&x_trans, &y, std::vector{2}); - } - return y; - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - -template -class GroupNormNPUKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto groups = ctx.Attr("groups"); - - auto place = ctx.GetPlace(); - phi::DenseTensor xnorm(x->type()); - xnorm.mutable_data(x->dims(), place); - GroupNormFunction F(ctx); - if (data_layout != DataLayout::kNCHW) { - xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); - F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); - } else { - paddle::framework::TensorCopy(*x, platform::NPUPlace(), &xnorm); - } - auto N = xnorm.dims()[0]; - auto C = xnorm.dims()[1]; - auto H = xnorm.dims()[2]; - auto W = xnorm.dims()[3]; - xnorm.Resize({N * groups, C * H * W / groups}); - std::vector axis = {1}; - auto reduce_dim = mean->dims(); - - mean->mutable_data({N * groups, 1}, place); - var->mutable_data({N * groups, 1}, place); - y->mutable_data(place); - F.ReduceMean(&xnorm, mean, axis); - - F.Sub(&xnorm, mean, &xnorm); - phi::DenseTensor sqr(x->type()); - sqr.mutable_data(xnorm.dims(), place); - - F.Mul(&xnorm, &xnorm, &sqr); - F.ReduceMean(&sqr, var, axis); - phi::DenseTensor std(x->type()); - std.mutable_data(var->dims(), place); - F.Adds(var, epsilon, &std); - F.Sqrt(&std, &std); - y->Resize(xnorm.dims()); - F.Div(&xnorm, &std, y); - y->Resize({N, C, H, W}); - if (scale) { - phi::DenseTensor scale_t(scale->type()); - scale_t.ShareDataWith(*scale); - scale_t.Resize({C, 1, 1}); - F.Mul(y, &scale_t, y); - } - if (bias) { - phi::DenseTensor bias_t(bias->type()); - bias_t.ShareDataWith(*bias); - bias_t.Resize({C, 1, 1}); - F.Add(y, &bias_t, y); - } - if (data_layout != DataLayout::kNCHW) { - F.Transpose(y, y, std::vector{0, 2, 3, 1}); - y->Resize({x->dims()}); - } - mean->Resize(reduce_dim); - var->Resize(reduce_dim); - } -}; - -template -class GroupNormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* y = ctx.Input("Y"); - auto* var = ctx.Input("Variance"); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - const auto G = ctx.Attr("groups"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - GroupNormFunction F(ctx); - auto place = ctx.GetPlace(); - auto _type = y->type(); - - phi::DenseTensor xnorm(_type); - xnorm.mutable_data(y->dims(), place); - phi::DenseTensor scale_share(_type); - scale_share.ShareDataWith(*scale); - phi::DenseTensor bias_share(_type); - bias_share.ShareDataWith(*bias); - - int64_t N = y->dims()[0]; - int64_t C, H, W; - framework::DDim scale_bias_dim; - if (data_layout == DataLayout::kNCHW) { - C = y->dims()[1]; - H = y->dims()[2]; - W = y->dims()[3]; - scale_bias_dim = phi::make_ddim({C, 1, 1}); - } else { - C = y->dims()[3]; - H = y->dims()[1]; - W = y->dims()[2]; - scale_bias_dim = phi::make_ddim({1, 1, C}); - } - 
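// NOTE (editorial): the backward kernel does not re-normalize x; it
// reconstructs the normalized activation from the forward *output* instead.
// Since the forward pass computed y = xnorm * gamma + beta, the Sub/DivNoNan
// calls below invert it as xnorm = (y - beta) / gamma, with DivNoNan so that
// channels where gamma == 0 yield 0 rather than NaN.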
scale_share.Resize(scale_bias_dim); - bias_share.Resize(scale_bias_dim); - F.Sub(y, &bias_share, &xnorm); - F.DivNoNan(&xnorm, &scale_share, &xnorm); - - if (d_bias) { - d_bias->mutable_data(place); - if (data_layout == DataLayout::kNCHW) { - F.ReduceSum(d_y, d_bias, std::vector{0, 2, 3}, false); - } else { - F.ReduceSum(d_y, d_bias, std::vector{0, 1, 2}, false); - } - } - if (d_scale) { - d_scale->mutable_data(place); - phi::DenseTensor dy_xnorm(_type); - dy_xnorm.mutable_data(d_y->dims(), place); - F.Mul(d_y, &xnorm, &dy_xnorm); - if (data_layout == DataLayout::kNCHW) { - F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 2, 3}); - } else { - F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 1, 2}); - } - } - - // std = Sqrt(var+epsilon), init shape = [ N, G ] - phi::DenseTensor std(_type); - std.mutable_data(var->dims(), place); - F.Adds(var, epsilon, &std); - F.Sqrt(&std, &std); - // d_xnorm_std = dy_proc * scale / std - phi::DenseTensor d_xnorm_std(_type); - d_xnorm_std.mutable_data(y->dims(), place); - F.Mul(d_y, &scale_share, &d_xnorm_std); - if (data_layout == DataLayout::kNCHW) { - xnorm.Resize({N, G, C * H * W / G}); - d_xnorm_std.Resize({N, G, C * H * W / G}); - std.Resize({N, G, 1}); - } else { - xnorm.Resize({N, C * H * W / G, G}); - d_xnorm_std.Resize({N, C * H * W / G, G}); - std.Resize({N, 1, G}); - } - F.Div(&d_xnorm_std, &std, &d_xnorm_std); - - // d_x = d_xnorm_std - // - Mean ( d_xnorm_std * x_norm, axis=1, keepdim=True ) * x_norm - // - Mean ( d_xnorm_std, axis=1, keepdim=True ) - d_x->mutable_data(place); - d_x->Resize(xnorm.dims()); - F.Mul(&d_xnorm_std, &xnorm, d_x); - phi::DenseTensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); - F.Mul(&dx1, &xnorm, d_x); - - phi::DenseTensor dx2 = - F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); - - F.Sub(&d_xnorm_std, d_x, d_x); - F.Sub(d_x, &dx2, d_x); - - d_x->Resize(y->dims()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(group_norm, - ops::GroupNormNPUKernel, - ops::GroupNormNPUKernel); -REGISTER_OP_NPU_KERNEL(group_norm_grad, - ops::GroupNormGradNPUKernel, - ops::GroupNormGradNPUKernel); diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc deleted file mode 100644 index 4812dfa47dfedbbc56524c380cd01ed92fd3b353..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-void HuberLossSub(const platform::Place& place,
-                  const aclrtStream& stream,
-                  const phi::DenseTensor* x,
-                  const phi::DenseTensor* y,
-                  phi::DenseTensor* z) {
-  // Calculate z = x - y
-  z->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossMuls(const platform::Place& place,
-                   const aclrtStream& stream,
-                   const phi::DenseTensor* x,
-                   float scalar,
-                   phi::DenseTensor* y) {
-  // Calculate y = x * scalar
-  y->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossZerosLike(const platform::Place& place,
-                        const aclrtStream& stream,
-                        const phi::DenseTensor* x,
-                        phi::DenseTensor* y) {
-  y->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*y}, {});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossSmoothL1Loss(const platform::Place& place,
-                           const aclrtStream& stream,
-                           const phi::DenseTensor* x,
-                           const phi::DenseTensor* y,
-                           float delta,
-                           phi::DenseTensor* z) {
-  z->mutable_data<T>(x->dims(), place);
-  const auto& runner =
-      NpuOpRunner("SmoothL1Loss", {*x, *y}, {*z}, {{"sigma", delta}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossSmoothL1LossGrad(const platform::Place& place,
-                               const aclrtStream& stream,
-                               const phi::DenseTensor* pred,
-                               const phi::DenseTensor* lab,
-                               const phi::DenseTensor* dout,
-                               float sigma,
-                               phi::DenseTensor* grad) {
-  grad->mutable_data<T>(pred->dims(), place);
-  const auto& runner = NpuOpRunner(
-      "SmoothL1LossGrad", {*pred, *lab, *dout}, {*grad}, {{"sigma", sigma}});
-  runner.Run(stream);
-}
-
-template <typename T>
-class HuberLossNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in0 = ctx.Input<phi::DenseTensor>("X");
-    auto* in1 = ctx.Input<phi::DenseTensor>("Y");
-    auto* residual = ctx.Output<phi::DenseTensor>("Residual");
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    auto delta = ctx.Attr<float>("delta");
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    auto place = ctx.GetPlace();
-    HuberLossSub<T>(place, stream, in1, in0, residual);
-
-    HuberLossSmoothL1Loss<T>(place, stream, in0, in1, delta, out);
-    HuberLossMuls<T>(place, stream, out, delta, out);
-  }
-};
-
-template <typename T>
-class HuberLossGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* residual = ctx.Input<phi::DenseTensor>("Residual");
-    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
-    auto delta = ctx.Attr<float>("delta");
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    auto place = ctx.GetPlace();
-
-    phi::DenseTensor t_grad_rd;
-    if (dx || dy) {
-      phi::DenseTensor t_zero;
-      HuberLossZerosLike<T>(place, stream, residual, &t_zero);
-      HuberLossSmoothL1LossGrad<T>(
-          place, stream, residual, &t_zero, dout, delta, &t_grad_rd);
-    }
-    if (dx) {
-      HuberLossMuls<T>(place, stream, &t_grad_rd, -delta, dx);
-    }
-    if (dy) {
-      HuberLossMuls<T>(place, stream, &t_grad_rd, delta, dy);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(huber_loss,
-                       ops::HuberLossNPUKernel<float>,
-                       ops::HuberLossNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(huber_loss_grad,
-                       ops::HuberLossGradNPUKernel<float>,
-                       ops::HuberLossGradNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc
deleted file mode 100644
index 7188fe38fdc6801555fb06d4e85a4afcb261ddfc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/increment_op_npu.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class IncrementalNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x_tensor = context.Input<phi::DenseTensor>("X");
-    auto* out_tensor = context.Output<phi::DenseTensor>("Out");
-    float step = context.Attr<float>("step");
-    out_tensor->mutable_data<T>(context.GetPlace());
-
-    Tensor step_tensor(x_tensor->dtype());
-
-    step_tensor.mutable_data<T>({1}, context.GetPlace());
-    FillNpuTensorWithConstant<T>(&step_tensor, static_cast<T>(step));
-
-    const auto& runner =
-        NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {});
-
-    auto stream =
-        context.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_NPU_KERNEL(
-    increment,
-    paddle::operators::IncrementalNPUKernel<float>,
-    paddle::operators::IncrementalNPUKernel<double>,
-    paddle::operators::IncrementalNPUKernel<int>,
-#ifdef PADDLE_WITH_ASCEND_INT64
-    paddle::operators::IncrementalNPUKernel<int64_t>,
-#endif
-    paddle::operators::IncrementalNPUKernel<paddle::platform::float16>)
diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc
deleted file mode 100644
index 2a77ff82d0fa316304f0f40f0f67af47c60fe43c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/increment_op_npu_test.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(increment); -USE_OP_DEVICE_KERNEL(increment, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - init.push_back(static_cast(1.0)); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({1}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - f::AttributeMap attr_input = {{"step", static_cast(2.0)}}; - auto op = f::OpRegistry::CreateOp( - "increment", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attr_input); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1); - EXPECT_EQ(out_vec[0], static_cast(3.0)); -} - -TEST(increment, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "increment"); -} - -TEST(increment, NPU_fp64) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "increment"); -} diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc deleted file mode 100644 index 64a50041421b3b7d894c35b0765cc58858170748..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* index, - const phi::DenseTensor* input, - phi::DenseTensor* out) { - auto index_dims = index->dims(); - auto input_dims = input->dims(); - auto batch_size = input_dims[0]; - auto index_length = index_dims[1]; - - std::vector gather_index_vec; - std::vector index_vec; - framework::TensorToVector(*index, dev_ctx, &index_vec); - for (auto i = 0; i < batch_size; ++i) { - for (auto j = 0; j < index_length; j++) { - gather_index_vec.push_back(i); - gather_index_vec.push_back(index_vec[i * index_length + j]); - } - } - phi::DenseTensor gather_index; - framework::TensorFromVector(gather_index_vec, dev_ctx, &gather_index); - gather_index.Resize({batch_size, index_length, 2}); - - NpuOpRunner runner; - runner.SetType("GatherNd") - .AddInput(*input) - .AddInput(gather_index) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); -} - -template -class IndexSampleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGather(dev_ctx, index, input, out); - } else { - IndexSampleGather(dev_ctx, index, input, out); - } - } -}; - -template -void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* index, - const phi::DenseTensor* out_grad, - phi::DenseTensor* x_grad) { - auto index_dims = index->dims(); - auto input_dims = x_grad->dims(); - auto batch_size = input_dims[0]; - auto index_length = index_dims[1]; - - std::vector scatter_index_vec; - std::vector index_vec; - framework::TensorToVector(*index, dev_ctx, &index_vec); - for (auto i = 0; i < batch_size; ++i) { - for (auto j = 0; j < index_length; j++) { - scatter_index_vec.push_back(i); - scatter_index_vec.push_back(index_vec[i * index_length + j]); - } - } - phi::DenseTensor scatter_index; - framework::TensorFromVector(scatter_index_vec, dev_ctx, &scatter_index); - scatter_index.Resize({batch_size, index_length, 2}); - - NpuOpRunner runner; - runner.SetType("ScatterNd") - .AddInput(scatter_index) - .AddInput(*out_grad) - .AddInput(phi::vectorize(x_grad->dims())) - .AddOutput(*x_grad); - runner.Run(dev_ctx.stream()); -} - -template -class IndexSampleGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* index = ctx.Input("Index"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - x_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); - } else { - IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(index_sample, - ops::IndexSampleNPUKernel, - ops::IndexSampleNPUKernel, 
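// NOTE (editorial): IndexSampleGather above flattens a [batch_size, k] index
// tensor into GatherNd coordinates of shape [batch_size, k, 2]. For example,
// with batch_size = 2 and index = [[0, 2], [1, 0]], gather_index_vec becomes
// {0,0, 0,2, 1,1, 1,0}: each (i, index[i][j]) pair selects one element per
// row of the input. The grad kernel builds the same coordinates and scatters
// the output gradient back with ScatterNd instead.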
- ops::IndexSampleNPUKernel, - ops::IndexSampleNPUKernel); -REGISTER_OP_NPU_KERNEL(index_sample_grad, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel); diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc deleted file mode 100644 index dd9c5608a0469d30b1c0a854795096580aa10849..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class IndexSelectNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto dim = ctx.Attr("dim"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*x) - .AddInput(*index) - .AddInput(std::vector{dim}) - .AddOutput(*out); - runner.Run(stream); - } -}; - -template -class IndexSelectGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - - auto stream = - ctx.template device_context() - .stream(); - - auto x_dims = x_grad->dims(); - auto out_dims = out_grad->dims(); - - int dim = ctx.Attr("dim"); - if (dim < 0) { - dim += out_dims.size(); - } - - phi::DenseTensor casted_index; - if (framework::TransToProtoVarType(index->dtype()) != - framework::proto::VarType::INT32) { - casted_index.mutable_data(index->dims(), ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*index}, {casted_index}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(stream); - } else { - casted_index.ShareDataWith(*index); - } - - if (dim == 0) { - x_grad->mutable_data(ctx.GetPlace()); - const auto& zeros_runner = NpuOpRunner("ZerosLike", {*x_grad}, {*x_grad}); - zeros_runner.Run(stream); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(*out_grad) - .AddInput(casted_index) - .AddInput(std::vector{x_dims[dim]}) - .AddOutput(*x_grad); - runner.Run(stream); - } else { - phi::DenseTensor transed_out_grad; - std::vector in_trans_perm; - in_trans_perm.push_back(dim); - for (int i = 0; i < out_dims.size(); ++i) { - if (i == dim) continue; - in_trans_perm.push_back(i); - } - framework::DDim transed_out_dims(out_dims); - for (size_t i = 0; i < in_trans_perm.size(); ++i) { - transed_out_dims[i] = out_dims[in_trans_perm[i]]; - } - transed_out_grad.mutable_data(transed_out_dims, ctx.GetPlace()); - NpuOpRunner 
in_trans_runner; - in_trans_runner.SetType("Transpose") - .AddInput(*out_grad) - .AddInput(std::move(in_trans_perm)) - .AddOutput(transed_out_grad); - in_trans_runner.Run(stream); - - phi::DenseTensor sum_out; - framework::DDim sum_dims(x_dims); - sum_dims[0] = x_dims[dim]; - auto idx = 1; - for (int i = 0; i < x_dims.size(); ++i) { - if (i == dim) continue; - sum_dims[idx++] = x_dims[i]; - } - sum_out.mutable_data(sum_dims, ctx.GetPlace()); - const auto& zeros_runner = NpuOpRunner("ZerosLike", {sum_out}, {sum_out}); - zeros_runner.Run(stream); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(transed_out_grad) - .AddInput(casted_index) - .AddInput(std::vector{x_dims[dim]}) - .AddOutput(sum_out); - runner.Run(stream); - - std::vector out_trans_perm; - for (int i = 1; i < 1 + dim; ++i) { - out_trans_perm.push_back(i); - } - out_trans_perm.push_back(0); - for (int i = 1 + dim; i < x_dims.size(); ++i) { - out_trans_perm.push_back(i); - } - x_grad->mutable_data(ctx.GetPlace()); - NpuOpRunner out_trans_runner; - out_trans_runner.SetType("Transpose") - .AddInput(sum_out) - .AddInput(std::move(out_trans_perm)) - .AddOutput(*x_grad); - out_trans_runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - index_select, - ops::IndexSelectNPUKernel, - ops::IndexSelectNPUKernel, - ops::IndexSelectNPUKernel); -REGISTER_OP_NPU_KERNEL( - index_select_grad, - ops::IndexSelectGradNPUKernel, - ops::IndexSelectGradNPUKernel, - ops::IndexSelectGradNPUKernel); diff --git a/paddle/fluid/operators/instance_norm_op_npu.cc b/paddle/fluid/operators/instance_norm_op_npu.cc deleted file mode 100644 index 03307895f09e2319547ad9e8f2a36eab30a55b9f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/instance_norm_op_npu.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class InstanceNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("SavedMean"); - auto* variance = ctx.Output("SavedVariance"); - auto& dev_ctx = ctx.template device_context(); - - dev_ctx.template Alloc(y); - dev_ctx.template Alloc(mean); - dev_ctx.template Alloc(variance); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - PADDLE_ENFORCE(x_dims.size() <= 5 && x_dims.size() >= 3, - platform::errors::InvalidArgument( - "InstanceNorm only supports the dimension of input " - " less equal to 5 and greater equal to 3. 
the dimension " - "of input is %d.", - x_dims.size())); - - auto tmp_x_dims = phi::vectorize(x_dims); - auto tmp_y_dims = phi::vectorize(y_dims); - if (x_dims.size() < 5) { - for (size_t i = x_dims.size(); i < 5; ++i) { - tmp_x_dims.insert(tmp_x_dims.begin() + 2, 1); - tmp_y_dims.insert(tmp_y_dims.begin() + 2, 1); - } - } - - phi::DenseTensor tmp_x, tmp_y; - tmp_x.ShareDataWith(*x); - - tmp_x.Resize(phi::make_ddim(tmp_x_dims)); - tmp_x.set_layout(phi::DataLayout::NCDHW); - tmp_y.ShareDataWith(*y); - tmp_y.Resize(phi::make_ddim(tmp_y_dims)); - tmp_y.set_layout(phi::DataLayout::NCDHW); - - NpuOpRunner runner; - - runner.SetType("InstanceNorm") - .AddInput(tmp_x) - .AddInput(*scale) - .AddInput(*bias) - .AddAttr("data_format", std::string("NCDHW")) - .AddAttr("epsilon", epsilon) - .AddOutput(tmp_y) - .AddOutput(*mean) - .AddOutput(*variance); - runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - instance_norm, - ops::InstanceNormNPUKernel, - ops::InstanceNormNPUKernel); diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc deleted file mode 100644 index 108efafff683f0ffc01b04406ee4ca5bd5d8ec18..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ /dev/null @@ -1,226 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/operators/interpolate_op.h" - -namespace paddle { -namespace operators { -using DataLayout = phi::DataLayout; - -inline static void CheckArgument(const framework::ExecutionContext& ctx) { - const std::string interp_method = ctx.Attr("interp_method"); -#if (CANN_VERSION_CODE < 512000) - bool align_corners = ctx.Attr("align_corners"); - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); -#endif - PADDLE_ENFORCE_EQ( - interp_method, - "nearest", - platform::errors::InvalidArgument( - "NPU Interpolate Kernel only support nearest interpolotion.")); -} - -inline static void ExtractNCHW(const framework::DDim& dims, - const DataLayout& data_layout, - int32_t* n, - int32_t* c, - int32_t* h, - int32_t* w) { - *n = dims[0]; - if (data_layout == DataLayout::kNCHW) { - *c = dims[1]; - *h = dims[2]; - *w = dims[3]; - } else { // kNHWC - *h = dims[1]; - *w = dims[2]; - *c = dims[3]; - } -} - -static void CalcOutSize(const framework::ExecutionContext& ctx, - int32_t in_h, - int32_t in_w, - int32_t* out_h, - int32_t* out_w) { - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - *out_h = ctx.Attr("out_h"); - *out_w = ctx.Attr("out_w"); - - auto dev_ctx = platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - - if (list_new_size_tensor.size() > 0) { - std::vector new_size_h(1); - std::vector new_size_w(1); - framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &new_size_h); - framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &new_size_w); - *out_h = new_size_h[0]; - *out_w = new_size_w[0]; - } else { - float scale; - auto scale_tensor = ctx.Input("Scale"); - if (scale_tensor != nullptr) { - std::vector scale_data; - framework::TensorToVector(*scale_tensor, *dev_ctx, &scale_data); - scale = scale_data[0]; - } else { - scale = ctx.Attr("scale"); - } - - if (scale > 0) { - *out_h = static_cast(in_h * scale); - *out_w = static_cast(in_w * scale); - } - - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - std::vector out_size_data; - framework::TensorToVector(*out_size, *dev_ctx, &out_size_data); - *out_h = out_size_data[0]; - *out_w = out_size_data[1]; - } - } - - PADDLE_ENFORCE_GT(*out_h, - 0, - platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(*out_w, - 0, - platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); -} - -template -class InterpolateNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // NOTE(Ruibiao): - // this kernel only support nearest interpolotion for 2D images - // the Ascend 'ResizeNearestNeighborV2' used in this kernle has diff - // when 'align_corners' is 'true' or data type is 'double' - CheckArgument(ctx); - - auto* input = ctx.Input("X"); - framework::DDim input_dims = input->dims(); - - const std::string data_layout_str = - ctx.Attr("data_layout"); // kNCHW or kNHWC - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - int32_t n, c, h, w, out_h, out_w; - ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w); - CalcOutSize(ctx, h, w, &out_h, &out_w); - - // the 'input' tensor may has no set (or wrong set) of the layout - phi::DenseTensor input_x(input->type()); - 
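// NOTE (editorial): CalcOutSize above resolves the output size with the
// priority SizeTensor > OutSize > Scale (tensor input) > scale (attribute) >
// the out_h/out_w attributes. For example, with in_h = in_w = 4, scale = 2.0,
// and no size inputs, it yields out_h = out_w = 8; an OutSize tensor holding
// {3, 5} would instead force a 3 x 5 output.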
input_x.ShareDataWith(*input); - input_x.set_layout(data_layout); - - auto* output = ctx.Output("Out"); - framework::DDim output_dims; - if (data_layout == DataLayout::kNCHW) { - output_dims = {n, c, out_h, out_w}; - } else { - output_dims = {n, out_h, out_w, c}; - } - output->set_layout(data_layout); - output->mutable_data(output_dims, ctx.GetPlace()); - - NpuOpRunner npu_op_runner; - auto npu_stream = - ctx.template device_context() - .stream(); - npu_op_runner.SetType("ResizeNearestNeighborV2") - .AddInput(input_x) - .AddInput(std::vector{out_h, out_w}) - .AddOutput(*output) - .AddAttr("align_corners", false) - .AddAttr("half_pixel_centers", false) - .Run(npu_stream); - } -}; - -template -class InterpolateGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // NOTE(Ruibiao): - // this kernel only support nearest interpolotion for 2D images - // the Ascend 'ResizeNearestNeighborV2' used in this kernle has diff - // when 'align_corners' is 'true' or data type is 'double' - CheckArgument(ctx); - - auto* input = ctx.Input("X"); - framework::DDim input_dims = input->dims(); - - const std::string data_layout_str = - ctx.Attr("data_layout"); // kNCHW or kNHWC - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - int32_t n, c, h, w, out_h, out_w; - ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w); - CalcOutSize(ctx, h, w, &out_h, &out_w); - - // the 'output_grad' tensor may has no set (or wrong set) of the layout - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - phi::DenseTensor output_grad_tmp(output_grad->type()); - output_grad_tmp.ShareDataWith(*output_grad); - output_grad_tmp.set_layout(data_layout); - - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - input_grad->set_layout(data_layout); - framework::DDim input_grad_dims; - if (data_layout == DataLayout::kNCHW) { - input_grad_dims = {n, c, h, w}; - } else { - input_grad_dims = {n, h, w, c}; - } - input_grad->mutable_data(input_grad_dims, ctx.GetPlace()); - - NpuOpRunner npu_op_runner; - auto npu_stream = - ctx.template device_context() - .stream(); - npu_op_runner.SetType("ResizeNearestNeighborV2Grad") - .AddInput(output_grad_tmp) - .AddInput(std::vector{h, w}) - .AddOutput(*input_grad) - .AddAttr("align_corners", false) - .AddAttr("half_pixel_centers", false) - .Run(npu_stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(nearest_interp, - ops::InterpolateNPUKernel, - ops::InterpolateNPUKernel); -REGISTER_OP_NPU_KERNEL(nearest_interp_grad, - ops::InterpolateGradNPUKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc deleted file mode 100644 index d16494f229e42a8e3160766cc22fbcc6c9a0690b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ /dev/null @@ -1,812 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/interpolate_function.h" - -namespace paddle { -namespace operators { - -using DataLayout = phi::DataLayout; -using DDim = framework::DDim; -using fp16 = paddle::platform::float16; - -template -struct InterpolateFunction { - public: - explicit InterpolateFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - t0.mutable_data({1}, place); - t1.mutable_data({1}, place); - tn.mutable_data({1}, place); - FillNpuTensorWithConstant(&t0, static_cast(0)); - FillNpuTensorWithConstant(&t1, static_cast(1)); - } - void Arange(int n, phi::DenseTensor* x) { - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); - runner.Run(stream); - } - void ReduceSum(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - const auto& runner = NpuOpRunner( - "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(y->dtype())); - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); - runner.Run(stream); - } - void Gather(const phi::DenseTensor* x, - const phi::DenseTensor* indices, - const int axis, - phi::DenseTensor* y) { - const auto& runner = - NpuOpRunner("GatherV2D", {*x, *indices}, {*y}, {{"axis", axis}}); - runner.Run(stream); - } - void GatherGrad(const phi::DenseTensor* gy, - const phi::DenseTensor* indices, - const int axis, - phi::DenseTensor* gx) { - // 1 gy swapaxis: axis & 0 - int len = (gy->dims()).size(); - std::vector axis_swap(len); - for (int i = 0; i < len; i++) { - axis_swap[i] = i; - } - axis_swap[0] = axis; - axis_swap[axis] = 0; - auto y_new_shape = gy->dims(); - auto yt = y_new_shape[axis]; - y_new_shape[axis] = y_new_shape[0]; - y_new_shape[0] = yt; - phi::DenseTensor gy_t; - gy_t.mutable_data(y_new_shape, place); - Transpose(gy, &gy_t, axis_swap); - // 2 scatter - auto x_new_shape = gx->dims(); - auto xt = x_new_shape[axis]; - x_new_shape[axis] = x_new_shape[0]; - x_new_shape[0] = xt; - phi::DenseTensor gx_zero, gx_t; - gx_zero.mutable_data(x_new_shape, place); - gx_t.mutable_data(x_new_shape, place); - FillNpuTensorWithConstant(&gx_zero, static_cast(0)); - gx_zero.Resize(x_new_shape); - Scatter(&gx_zero, indices, &gy_t, &gx_t); - // 3 gx swapaxis: axis, 0 - Transpose(&gx_t, gx, axis_swap); - } - void Scatter(const phi::DenseTensor* x, - const phi::DenseTensor* index, - const 
phi::DenseTensor* updates, - phi::DenseTensor* y) { - const auto& runner = - NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*y}, {}); - runner.Run(stream); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Floor(const phi::DenseTensor* x, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Floor", {*x}, {*y}, {}); - runner.Run(stream); - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; - phi::DenseTensor t0; - phi::DenseTensor t1; - phi::DenseTensor tn; -}; - -template <> -void InterpolateFunction::Arange(int n, phi::DenseTensor* x) { - phi::DenseTensor x_fp32(phi::DataType::FLOAT32); - x_fp32.mutable_data(x->dims(), place); - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); - runner.Run(stream); - Cast(&x_fp32, x); -} - -void InterpolateParamCompute(const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout, - const DDim& indim, - const DDim& outdim, - int* axis_h, - int* axis_w, - int* in_h, - int* in_w, - int* out_h, - int* out_w, - float* ratio_h, - float* ratio_w) { - if (data_layout == DataLayout::kNCHW) { - *axis_h = 2; - *axis_w = 3; - } else { - *axis_h = 1; - *axis_w = 2; - } - *out_h = outdim[*axis_h]; - *out_w = outdim[*axis_w]; - *in_h = indim[*axis_h]; - *in_w = indim[*axis_w]; - *ratio_h = 0.0f; - *ratio_w = 0.0f; - if (*out_h > 1) { - *ratio_h = - align_corners - ? static_cast(*in_h - 1) / (*out_h - 1) - : (scale_h > 0 ? 1 / scale_h : static_cast(*in_h) / *out_h); - } - if (*out_w > 1) { - *ratio_w = - align_corners - ? static_cast(*in_w - 1) / (*out_w - 1) - : (scale_w > 0 ? 
1 / scale_w : static_cast(*in_w) / *out_w); - } -} - -template -void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, - const DataLayout& data_layout, - int in_h, - int in_w, - int out_h, - int out_w, - bool align_cond, - float ratio_h, - float ratio_w, - phi::DenseTensor* h0, - phi::DenseTensor* h1, - phi::DenseTensor* w0, - phi::DenseTensor* w1, - phi::DenseTensor* coef_h0, - phi::DenseTensor* coef_h1, - phi::DenseTensor* coef_w0, - phi::DenseTensor* coef_w1) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - phi::DenseTensor _h0, _w0; - _h0.mutable_data({out_h}, place); - _w0.mutable_data({out_w}, place); - F.Arange(out_h, &_h0); - F.Arange(out_w, &_w0); - if (align_cond) { - F.Adds(&_h0, static_cast(0.5), &_h0); - F.Adds(&_w0, static_cast(0.5), &_w0); - F.Muls(&_h0, ratio_h, &_h0); - F.Muls(&_w0, ratio_w, &_w0); - F.Adds(&_h0, static_cast(-0.5), &_h0); - F.Adds(&_w0, static_cast(-0.5), &_w0); - } else { - F.Muls(&_h0, ratio_h, &_h0); - F.Muls(&_w0, ratio_w, &_w0); - } - - phi::DenseTensor zero_t; - phi::DenseTensor one_t; - zero_t.mutable_data({1}, place); - one_t.mutable_data({1}, place); - FillNpuTensorWithConstant(&zero_t, static_cast(0)); - FillNpuTensorWithConstant(&one_t, static_cast(1)); - F.Maximum(&_h0, &zero_t, &_h0); - F.Maximum(&_w0, &zero_t, &_w0); - - phi::DenseTensor _h0_floor, _w0_floor; - _h0_floor.mutable_data({out_h}, place); - _w0_floor.mutable_data({out_w}, place); - F.Floor(&_h0, &_h0_floor); - F.Floor(&_w0, &_w0_floor); - F.Cast(&_h0_floor, h0); - F.Cast(&_w0_floor, w0); - - phi::DenseTensor one_int; - one_int.mutable_data({1}, place); - FillNpuTensorWithConstant(&one_int, static_cast(1)); - F.Add(h0, &one_int, h1); - F.Add(w0, &one_int, w1); - phi::DenseTensor t_max_h, t_max_w; - t_max_h.mutable_data({1}, place); - t_max_w.mutable_data({1}, place); - FillNpuTensorWithConstant(&t_max_h, static_cast(in_h - 1)); - FillNpuTensorWithConstant(&t_max_w, static_cast(in_w - 1)); - F.Minimum(h1, &t_max_h, h1); - F.Minimum(w1, &t_max_w, w1); - - F.Sub(&_h0, &_h0_floor, coef_h1); - F.Sub(&_w0, &_w0_floor, coef_w1); - F.Sub(&one_t, coef_h1, coef_h0); - F.Sub(&one_t, coef_w1, coef_w0); - - if (data_layout == DataLayout::kNCHW) { - coef_h0->Resize({out_h, 1}); - coef_h1->Resize({out_h, 1}); - } else { - coef_h0->Resize({out_h, 1, 1}); - coef_h1->Resize({out_h, 1, 1}); - coef_w0->Resize({out_w, 1}); - coef_w1->Resize({out_w, 1}); - } -} - -template -void BilinearFwdNpu(const framework::ExecutionContext& ctx, - const phi::DenseTensor* input, - phi::DenseTensor* output, - const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - auto outdim = output->dims(); - auto indim = input->dims(); - - int axis_h, axis_w; - int out_h, out_w, in_h, in_w; - float ratio_h, ratio_w; - InterpolateParamCompute(scale_h, - scale_w, - align_corners, - align_mode, - data_layout, - indim, - outdim, - &axis_h, - &axis_w, - &in_h, - &in_w, - &out_h, - &out_w, - &ratio_h, - &ratio_w); - - phi::DenseTensor h0, h1, w0, w1; - h0.mutable_data({out_h}, place); - h1.mutable_data({out_h}, place); - w0.mutable_data({out_w}, place); - w1.mutable_data({out_w}, place); - phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; - coef_h0.mutable_data({out_h}, place); - coef_h1.mutable_data({out_h}, place); - coef_w0.mutable_data({out_w}, place); - coef_w1.mutable_data({out_w}, place); - bool align_cond = align_mode == 0 && !align_corners; - 
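// NOTE (editorial): InterpolateParamCompute above maps each output coordinate
// back to a source coordinate through a per-axis ratio:
//   ratio = align_corners ? (in - 1) / (out - 1) : in / out  (or 1 / scale)
// e.g. in_h = 4, out_h = 8 gives ratio_h = 3/7 with align_corners and 0.5
// without. When align_cond holds, BilinearParamTensorCompute additionally
// applies the half-pixel mapping src = (dst + 0.5) * ratio - 0.5, which is
// what its Adds(0.5) / Adds(-0.5) pair implements.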
BilinearParamTensorCompute(ctx, - data_layout, - in_h, - in_w, - out_h, - out_w, - align_cond, - ratio_h, - ratio_w, - &h0, - &h1, - &w0, - &w1, - &coef_h0, - &coef_h1, - &coef_w0, - &coef_w1); - - phi::DenseTensor input_gather_h0, input_gather_h1; - auto dim_gather_h = indim; - dim_gather_h[axis_h] = out_h; - input_gather_h0.mutable_data(dim_gather_h, place); - input_gather_h1.mutable_data(dim_gather_h, place); - - F.Gather(input, &h0, axis_h, &input_gather_h0); - F.Gather(input, &h1, axis_h, &input_gather_h1); - - F.Mul(&input_gather_h0, &coef_h0, &input_gather_h0); - F.Mul(&input_gather_h1, &coef_h1, &input_gather_h1); - phi::DenseTensor out_x4; - out_x4.mutable_data({4, outdim[0], outdim[1], outdim[2], outdim[3]}, - place); - phi::DenseTensor input_gather_h0_w0 = out_x4.Slice(0, 1); - phi::DenseTensor input_gather_h0_w1 = out_x4.Slice(1, 2); - phi::DenseTensor input_gather_h1_w0 = out_x4.Slice(2, 3); - phi::DenseTensor input_gather_h1_w1 = out_x4.Slice(3, 4); - F.Gather(&input_gather_h0, &w0, axis_w, &input_gather_h0_w0); - F.Gather(&input_gather_h0, &w1, axis_w, &input_gather_h0_w1); - F.Gather(&input_gather_h1, &w0, axis_w, &input_gather_h1_w0); - F.Gather(&input_gather_h1, &w1, axis_w, &input_gather_h1_w1); - F.Mul(&input_gather_h0_w0, &coef_w0, &input_gather_h0_w0); - F.Mul(&input_gather_h0_w1, &coef_w1, &input_gather_h0_w1); - F.Mul(&input_gather_h1_w0, &coef_w0, &input_gather_h1_w0); - F.Mul(&input_gather_h1_w1, &coef_w1, &input_gather_h1_w1); - F.ReduceSum(&out_x4, output, std::vector{0}, false); -} - -template -void BilinearBwdNpu(const framework::ExecutionContext& ctx, - const phi::DenseTensor* gout, - phi::DenseTensor* gin, - const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - auto outdim = gout->dims(); - auto indim = gin->dims(); - - int axis_h, axis_w; - int out_h, out_w, in_h, in_w; - float ratio_h, ratio_w; - InterpolateParamCompute(scale_h, - scale_w, - align_corners, - align_mode, - data_layout, - indim, - outdim, - &axis_h, - &axis_w, - &in_h, - &in_w, - &out_h, - &out_w, - &ratio_h, - &ratio_w); - - phi::DenseTensor h0, h1, w0, w1; - h0.mutable_data({out_h}, place); - h1.mutable_data({out_h}, place); - w0.mutable_data({out_w}, place); - w1.mutable_data({out_w}, place); - phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; - coef_h0.mutable_data({out_h}, place); - coef_h1.mutable_data({out_h}, place); - coef_w0.mutable_data({out_w}, place); - coef_w1.mutable_data({out_w}, place); - bool align_cond = align_mode == 0 && !align_corners; - BilinearParamTensorCompute(ctx, - data_layout, - in_h, - in_w, - out_h, - out_w, - align_cond, - ratio_h, - ratio_w, - &h0, - &h1, - &w0, - &w1, - &coef_h0, - &coef_h1, - &coef_w0, - &coef_w1); - - phi::DenseTensor gy_w0, gy_w1; - gy_w0.mutable_data(outdim, place); - gy_w1.mutable_data(outdim, place); - F.Mul(gout, &coef_w0, &gy_w0); - F.Mul(gout, &coef_w1, &gy_w1); - - auto dim_gather_h = indim; - dim_gather_h[axis_h] = out_h; - phi::DenseTensor g_gather_w0, g_gather_w1; - g_gather_w0.mutable_data(dim_gather_h, place); - g_gather_w1.mutable_data(dim_gather_h, place); - w0.Resize({out_w, 1}); - w1.Resize({out_w, 1}); - F.GatherGrad(&gy_w0, &w0, axis_w, &g_gather_w0); - F.GatherGrad(&gy_w1, &w1, axis_w, &g_gather_w1); - - F.Add(&g_gather_w0, &g_gather_w1, &g_gather_w0); - F.Mul(&g_gather_w0, &coef_h1, &g_gather_w1); - F.Mul(&g_gather_w0, &coef_h0, &g_gather_w0); - - phi::DenseTensor gx_0, gx_1; - 
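// NOTE (editorial): GatherGrad (defined in InterpolateFunction above) has no
// single Ascend op, so it is emulated in three steps: transpose the gathered
// axis to axis 0, TensorScatterAdd the incoming gradient into a zeroed tensor
// at the gather indices, then transpose back. The GatherGrad calls below use
// it to push the interpolated gradients back through the W- and H-axis
// gathers of the four bilinear corners.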
gx_0.mutable_data(indim, place); - gx_1.mutable_data(indim, place); - h0.Resize({out_h, 1}); - h1.Resize({out_h, 1}); - F.GatherGrad(&g_gather_w0, &h0, axis_h, &gx_0); - F.GatherGrad(&g_gather_w1, &h1, axis_h, &gx_1); - - F.Add(&gx_0, &gx_1, gin); -} - -template -class InterpolateV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - auto input_dims = input->dims(); - PADDLE_ENFORCE_EQ( - input_dims.size(), - 4UL, - platform::errors::External( - "NPU Interpolate Kernel only support 4-D phi::DenseTensor.")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - phi::funcs::ExtractNCDWH( - input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - - // To-do(qili93): need to support align_corners = true case, try ReSizeD - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_shape_tensor.size() > 0) { - std::vector output_h(1); - std::vector output_w(1); - auto dev_ctx = - platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - framework::TensorToVector(*list_new_shape_tensor[0], *dev_ctx, &output_h); - framework::TensorToVector(*list_new_shape_tensor[1], *dev_ctx, &output_w); - out_h = output_h[0]; - out_w = output_w[0]; - } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); - auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = - phi::funcs::get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) 
{ - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - } - PADDLE_ENFORCE_GT(out_h, - 0, - platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, - 0, - platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_h, out_w}; - } else { - dim_out = {n, out_h, out_w, c}; - } - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*input, ctx.GetPlace(), output); - return; - } - - auto stream = - ctx.template device_context() - .stream(); - - // To-do(qili93): need to support bilineare, try ResizeD - // Add bilineare by zhulei - if ("nearest" == interp_method) { - NpuOpRunner runner; - runner.SetType("ResizeNearestNeighborV2") - .AddInput(*input) - .AddInput(std::vector{out_h, out_w}) - .AddOutput(*output) - .AddAttr("align_corners", align_corners) - .AddAttr("half_pixel_centers", false); - runner.Run(stream); - } else if ("bilinear" == interp_method) { - int align_mode = ctx.Attr("align_mode"); - BilinearFwdNpu(ctx, - input, - output, - scale_h, - scale_w, - align_corners, - align_mode, - data_layout); - } - } -}; - -template -class InterpolateV2NPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - phi::funcs::ExtractNCDWH( - input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - - // To-do(qili93): need to support align_corners = true case, try ReSizeD - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - std::vector output_h(1); - std::vector output_w(1); - auto dev_ctx = - platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &output_h); - framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &output_w); - out_h = output_h[0]; - out_w = output_w[0]; - } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); - auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = - phi::funcs::get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_w = scale_data[0]; - scale_h = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' 
phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_h, in_w}; - } else { - dim_grad = {n, in_h, in_w, c}; - } - - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); - return; - } - - auto stream = - ctx.template device_context() - .stream(); - - // To-do(qili93): need to support bilineare, try ResizeGradD - if ("nearest" == interp_method) { - NpuOpRunner runner; - runner.SetType("ResizeNearestNeighborV2Grad") - .AddInput(*output_grad) - .AddInput(std::vector{in_h, in_w}) - .AddOutput(*input_grad) - .AddAttr("align_corners", align_corners) - .AddAttr("half_pixel_centers", false); - runner.Run(stream); - } else if ("bilinear" == interp_method) { - int align_mode = ctx.Attr("align_mode"); - BilinearBwdNpu(ctx, - output_grad, - input_grad, - scale_h, - scale_w, - align_corners, - align_mode, - data_layout); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - nearest_interp_v2, - ops::InterpolateV2NPUKernel, - ops::InterpolateV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - nearest_interp_v2_grad, - ops::InterpolateV2NPUGradKernel, - ops::InterpolateV2NPUGradKernel); - -REGISTER_OP_NPU_KERNEL( - bilinear_interp_v2, - ops::InterpolateV2NPUKernel, - ops::InterpolateV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - bilinear_interp_v2_grad, - ops::InterpolateV2NPUGradKernel, - ops::InterpolateV2NPUGradKernel); diff --git a/paddle/fluid/operators/is_empty_op_npu.cc b/paddle/fluid/operators/is_empty_op_npu.cc deleted file mode 100644 index 91a0698d626f550670e0407684b77616f425bcce..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/is_empty_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/is_empty_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - is_empty, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc deleted file mode 100644 index d2b4626c58cb47331a6f3754d9f25aa90132c437..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class KLDivLossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); - auto reduction = ctx.Attr("reduction"); - loss->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - - if ("none" == reduction) { - // log(label) - auto ones_tensor = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& ones_runner = - NpuOpRunner("OnesLike", {*target}, {ones_tensor}, {}); - ones_runner.Run(stream); - - auto sub_tensor = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& sub_runner = - NpuOpRunner("Sub", {*target, ones_tensor}, {sub_tensor}, {}); - sub_runner.Run(stream); - - auto log_target = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& log_runner = - NpuOpRunner("Log1p", {sub_tensor}, {log_target}, {}); - log_runner.Run(stream); - - // log(label) - input - const auto& sub_runner2 = - NpuOpRunner("Sub", {log_target, *input}, {*loss}, {}); - sub_runner2.Run(stream); - - // label * (log(label) - input) - auto min_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - auto max_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - FillNpuTensorWithConstant(&min_value, static_cast(0)); - FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); - - auto cliped_target = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& clip_runner = NpuOpRunner( - "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); - clip_runner.Run(stream); - - const auto& mul_runner = - NpuOpRunner("Mul", {*loss, cliped_target}, {*loss}, {}); - mul_runner.Run(stream); - } else if ("batchmean" == reduction || "sum" == reduction) { - const auto& runner = NpuOpRunner( - "KLDiv", {*input, *target}, {*loss}, {{"reduction", reduction}}); - runner.Run(stream); - } else if ("mean" == reduction) { - const auto& runner = NpuOpRunner("KLDiv", - {*input, *target}, - {*loss}, - {{"reduction", std::string("sum")}}); - runner.Run(stream); - - const int numel = input->numel(); - const auto& muls_runner = - NpuOpRunner("Muls", - {*loss}, - {*loss}, - {{"value", static_cast(1.0 / numel)}}); - muls_runner.Run(stream); - } - } -}; - 
-template -class KLDivLossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* target = ctx.Input("Target"); - auto* loss_grad = - ctx.Input(framework::GradVarName("Loss")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto reduction = ctx.Attr("reduction"); - input_grad->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - - phi::DenseTensor loss_grad_transformed; - if ("none" == reduction) { - loss_grad_transformed.ShareDataWith(*loss_grad); - } else { - loss_grad_transformed.mutable_data(input_grad->dims(), ctx.GetPlace()); - - NpuOpRunner broadcast_runner; - broadcast_runner.SetType("BroadcastTo"); - broadcast_runner.AddInput(*loss_grad); - broadcast_runner.AddInput(phi::vectorize(input_grad->dims())); - broadcast_runner.AddOutput(loss_grad_transformed); - broadcast_runner.Run(stream); - } - auto min_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - auto max_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - FillNpuTensorWithConstant(&min_value, static_cast(0)); - FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); - - auto cliped_target = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& clip_runner = NpuOpRunner( - "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); - clip_runner.Run(stream); - - const auto& mul_runner = NpuOpRunner( - "Mul", {cliped_target, loss_grad_transformed}, {*input_grad}, {}); - mul_runner.Run(stream); - - float k = -1.0f; - - if ("mean" == reduction) { - k = static_cast(-1.0 / input_grad->numel()); - } else if ("batchmean" == reduction) { - k = static_cast(-1.0 / input_grad->dims()[0]); - } - - const auto& muls_runner = - NpuOpRunner("Muls", {*input_grad}, {*input_grad}, {{"value", k}}); - muls_runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(kldiv_loss, - ops::KLDivLossNPUKernel, - ops::KLDivLossNPUKernel); - -REGISTER_OP_NPU_KERNEL(kldiv_loss_grad, - ops::KLDivLossGradNPUKernel, - ops::KLDivLossGradNPUKernel); diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc deleted file mode 100644 index 5c267625f55f74c53694c4f27f3b768fecd44368..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
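// The kernels below implement standard label smoothing. With a prior
// distribution `PriorDist` the forward pass computes (prior broadcast over
// the batch rows):
//   out = (1 - epsilon) * in + epsilon * dist
// and without a prior it falls back to the uniform distribution:
//   out = (1 - epsilon) * in + epsilon / label_dim
// The backward pass reduces to d_in = (1 - epsilon) * d_out, which is a
// single Muls call in LabelSmoothGradNPUKernel.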
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void LabelSmoothMuls(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in, - float val, - phi::DenseTensor* out) { - out->mutable_data(in->dims(), place); - const auto& runner = NpuOpRunner("Muls", {*in}, {*out}, {{"value", val}}); - runner.Run(stream); -} - -template -void LabelSmoothAdds(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in, - float val, - phi::DenseTensor* out) { - out->mutable_data(in->dims(), place); - const auto& runner = NpuOpRunner("Adds", {*in}, {*out}, {{"value", val}}); - runner.Run(stream); -} - -template -void LabelSmoothAddBroadCast(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in1, - const phi::DenseTensor* in2, - phi::DenseTensor* out) { - out->mutable_data(place); - const auto& runner = NpuOpRunner("AddV2", {*in1, *in2}, {*out}, {}); - runner.Run(stream); -} - -template -class LabelSmoothNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto epsilon = ctx.Attr("epsilon"); - - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - if (dist_t) { - phi::DenseTensor tmp; - phi::DenseTensor dist; - phi::DenseTensor tmp2; - LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); - LabelSmoothMuls(place, stream, dist_t, epsilon, &tmp2); - tmp2.Resize({1, label_dim}); - LabelSmoothAddBroadCast(place, stream, &tmp, &tmp2, out_t); - } else { - phi::DenseTensor tmp; - LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); - LabelSmoothAdds(place, stream, &tmp, (epsilon / label_dim), out_t); - } - } -}; - -template -class LabelSmoothGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - auto epsilon = ctx.Attr("epsilon"); - - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - LabelSmoothMuls(place, stream, d_out_t, 1 - epsilon, d_in_t); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(label_smooth, - ops::LabelSmoothNPUKernel, - ops::LabelSmoothNPUKernel); -REGISTER_OP_NPU_KERNEL(label_smooth_grad, - ops::LabelSmoothGradNPUKernel, - ops::LabelSmoothGradNPUKernel); diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc deleted file mode 100644 index ca6762f2e325a5b5c2b9a1611e07274a0db327f0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ /dev/null @@ -1,449 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; - -using DataLayout = phi::DataLayout; - -template -class NormDataType; - -template <> -class NormDataType { - public: - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; -}; - -template <> -class NormDataType { - public: - using ScalingParamType = const float; - using BatchNormParamType = float; -}; - -template -using NormDataType = NormDataType; -template -using LayerNormParamType = typename NormDataType::BatchNormParamType; - -template -class LayerNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using U = LayerNormParamType; - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* variance = ctx.Output("Variance"); - const auto& x_dims = x->dims(); - std::vector axes; - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int right = static_cast(matrix_dim[1]); - - // The shape of scale and bias should be equal to x.shape[begin_norm_axis:], - // required by Ascend. 
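// A worked example (illustrative): for x.shape = [2, 3, 4, 5] with
// begin_norm_axis = 2, flatten_to_2d yields matrix_dim = [6, 20], so
// right = 20 and the loop below collects axes = [4, 5], the shape that
// scale and bias are temporarily resized to for the Ascend LayerNorm op.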
- for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - axes.push_back(x_dims[i]); - } - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor default_scale(x->type()); - if (!scale) { - default_scale.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(1.0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); - runner.Run(stream); - scale = &default_scale; - } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); - } - - phi::DenseTensor default_bias(x->type()); - if (!bias) { - default_bias.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); - runner.Run(stream); - bias = &default_bias; - } else { - const_cast(bias)->Resize(phi::make_ddim(axes)); - } - - // cast scale from LayerNormParamType to T if needed - phi::DenseTensor cast_scale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32) { - cast_scale.Resize(scale->dims()); - cast_scale.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_scale = - NpuOpRunner("Cast", - {*scale}, - {cast_scale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_scale.Run(stream); - } else { - cast_scale.ShareDataWith(*scale); - } - - // cast bias from LayerNormParamType to T if needed - phi::DenseTensor cast_bias(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32) { - cast_bias.Resize(bias->dims()); - cast_bias.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_bias = - NpuOpRunner("Cast", - {*bias}, - {cast_bias}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_bias.Run(stream); - } else { - cast_bias.ShareDataWith(*bias); - } - - y->mutable_data(ctx.GetPlace()); - - // mean should be of U type - phi::DenseTensor* tmp_mean = mean; - phi::DenseTensor cast_mean(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32)) { - cast_mean.Resize(mean->dims()); - cast_mean.mutable_data(ctx.GetPlace()); - tmp_mean = &cast_mean; - mean->mutable_data(ctx.GetPlace()); - } else { - mean->mutable_data(ctx.GetPlace()); - } - - // same for variance - phi::DenseTensor* tmp_variance = variance; - phi::DenseTensor cast_variance(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32)) { - cast_variance.Resize(variance->dims()); - cast_variance.mutable_data(ctx.GetPlace()); - tmp_variance = &cast_variance; - variance->mutable_data(ctx.GetPlace()); - } else { - 
variance->mutable_data(ctx.GetPlace()); - } - - const auto& runner = NpuOpRunner("LayerNorm", - {*x, cast_scale, cast_bias}, - {*y, *tmp_mean, *tmp_variance}, - {{"begin_norm_axis", begin_norm_axis}, - {"begin_params_axis", begin_norm_axis}, - {"epsilon", epsilon}}); - runner.Run(stream); - - // cast back from FP16 to FP32 - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(mean->type())); - const auto& runner_cast_mean = - NpuOpRunner("Cast", - {*tmp_mean}, - {*mean}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mean.Run(stream); - } - // same for variance - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(variance->type())); - const auto& runner_cast_variance = - NpuOpRunner("Cast", - {*tmp_variance}, - {*variance}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_variance.Run(stream); - } - - // revert shape of scale and bias - // TODO(zhiqiu): better implementation, use a tmp tensor to avoid writing to - // the input tensor. - const_cast(scale)->Resize(phi::make_ddim({right})); - const_cast(bias)->Resize(phi::make_ddim({right})); - } -}; - -template -class LayerNormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using U = LayerNormParamType; - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto* x = ctx.Input("X"); - const auto& x_dims = x->dims(); - const auto* mean = ctx.Input("Mean"); - const auto* variance = ctx.Input("Variance"); - const auto* scale = ctx.Input("Scale"); - const auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dscale = - ctx.Output(framework::GradVarName("Scale")); - auto* dbias = ctx.Output(framework::GradVarName("Bias")); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int right = static_cast(matrix_dim[1]); - - std::vector axes; - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - axes.push_back(x_dims[i]); - } - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - // No need to compute any gradient, just return - if (!dx && !dscale && !dbias) { - return; - } - - // The rank of mean should be equal to the rank of x, as required by Ascend.
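// A worked example (illustrative): for x.shape = [2, 3, 4, 5] with
// begin_norm_axis = 2, the loops below build new_shape = [2, 3, 1, 1], so
// mean and variance keep the rank of x while remaining broadcastable over
// the normalized trailing axes.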
- std::vector new_shape; - for (auto i = 0; i < begin_norm_axis; ++i) { - new_shape.push_back(x_dims[i]); - } - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - new_shape.push_back(1); - } - - auto mean_dims = mean->dims(); - const_cast(mean)->Resize(phi::make_ddim({new_shape})); - const_cast(variance)->Resize( - phi::make_ddim({new_shape})); - - phi::DenseTensor default_scale(x->type()); - if (!scale) { - default_scale.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(1.0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); - runner.Run(stream); - scale = &default_scale; - } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); - } - - // cast scale from LayerNormParamType to T if needed - phi::DenseTensor cast_scale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32) { - cast_scale.Resize(scale->dims()); - cast_scale.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_scale = - NpuOpRunner("Cast", - {*scale}, - {cast_scale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_scale.Run(stream); - } else { - cast_scale.ShareDataWith(*scale); - } - - // cast mean from LayerNormParamType to T if needed - phi::DenseTensor cast_mean(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32) { - cast_mean.Resize(mean->dims()); - cast_mean.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_mean = - NpuOpRunner("Cast", - {*mean}, - {cast_mean}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mean.Run(stream); - } else { - cast_mean.ShareDataWith(*mean); - } - - // cast variance from LayerNormParamType to T if needed - phi::DenseTensor cast_variance(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32) { - cast_variance.Resize(variance->dims()); - cast_variance.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_variance = - NpuOpRunner("Cast", - {*variance}, - {cast_variance}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_variance.Run(stream); - } else { - cast_variance.ShareDataWith(*variance); - } - - phi::DenseTensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); - dx = (dx == nullptr) ? &dx_ : dx; - dscale = (dscale == nullptr) ? &dscale_ : dscale; - dbias = (dbias == nullptr) ? 
&dbias_ : dbias; - - dx->Resize(x->dims()); - dx->mutable_data(ctx.GetPlace()); - - dscale->Resize(phi::make_ddim(axes)); - - dbias->Resize(phi::make_ddim(axes)); - - // dscale should be of U type - phi::DenseTensor* tmp_dscale = dscale; - phi::DenseTensor cast_dscale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32)) { - cast_dscale.Resize(dscale->dims()); - cast_dscale.mutable_data(ctx.GetPlace()); - tmp_dscale = &cast_dscale; - dscale->mutable_data(ctx.GetPlace()); - } else { - dscale->mutable_data(ctx.GetPlace()); - } - - // same for dbias - phi::DenseTensor* tmp_dbias = dbias; - phi::DenseTensor cast_dbias(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32)) { - cast_dbias.Resize(dbias->dims()); - cast_dbias.mutable_data(ctx.GetPlace()); - tmp_dbias = &cast_dbias; - dbias->mutable_data(ctx.GetPlace()); - } else { - dbias->mutable_data(ctx.GetPlace()); - } - - const auto& runner = - NpuOpRunner("LayerNormGrad", - {*dy, *x, cast_variance, cast_mean, cast_scale}, - {*dx, *tmp_dscale, *tmp_dbias}, - {}); - runner.Run(stream); - - // cast back from FP16 to FP32 - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(dscale->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dscale->type())); - const auto& runner_cast_dscale = - NpuOpRunner("Cast", - {*tmp_dscale}, - {*dscale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_dscale.Run(stream); - } - // same for dbias - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(dbias->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dbias->type())); - const auto& runner_cast_dbias = - NpuOpRunner("Cast", - {*tmp_dbias}, - {*dbias}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_dbias.Run(stream); - } - - const_cast(mean)->Resize(mean_dims); - const_cast(variance)->Resize(mean_dims); - const_cast(scale)->Resize(phi::make_ddim({right})); - dscale->Resize(phi::make_ddim({right})); - dbias->Resize(phi::make_ddim({right})); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(layer_norm, - ops::LayerNormNPUKernel, - ops::LayerNormNPUKernel); -REGISTER_OP_NPU_KERNEL(layer_norm_grad, - ops::LayerNormGradNPUKernel, - ops::LayerNormGradNPUKernel); diff --git a/paddle/fluid/operators/load_combine_op_npu.cc b/paddle/fluid/operators/load_combine_op_npu.cc deleted file mode 100644 index 4b9b96c23b0b71cf941c5663ffba0d75dbf4a41a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/load_combine_op_npu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/load_combine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op_npu.cc b/paddle/fluid/operators/load_op_npu.cc deleted file mode 100644 index 0e8517fd7b5296629e8d9ddbdfc0b6831f66eaff..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/load_op_npu.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace operators { -template -class LoadOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto place = ctx.GetPlace(); - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. 
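// In outline, the body below: (1) opens `file_path` as a binary stream,
// (2) dispatches on the output variable's type (phi::DenseTensor vs.
// SelectedRows), (3) deserializes only a slice when the `seek` attribute is
// non-negative, and (4) optionally casts the result to float16 in place when
// `load_as_fp16` is set.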
- auto filename = ctx.Attr("file_path"); - std::ifstream fin(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fin), - true, - platform::errors::Unavailable( - "Load operator fail to open file %s, please check " - "whether the model file is complete or damaged.", - filename)); - - auto out_var_name = ctx.OutputNames("Out").data(); - auto *out_var = ctx.OutputVar("Out"); - - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::InvalidArgument( - "The variable %s to be loaded cannot be found.", out_var_name)); - - if (out_var->IsType()) { - LoadLodTensor(fin, place, out_var, ctx); - } else if (out_var->IsType()) { - LoadSelectedRows(fin, place, out_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Load operator only supports loading phi::DenseTensor and " - "SelectedRows " - "variable, %s has wrong type", - out_var_name)); - } - } - - void LoadLodTensor(std::istream &fin, - const platform::Place &place, - framework::Variable *var, - const framework::ExecutionContext &ctx) const { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - auto *tensor = var->GetMutable(); - - auto seek = ctx.Attr("seek"); - - if (seek != -1) { - PADDLE_ENFORCE_GE(seek, - 0, - platform::errors::InvalidArgument( - "seek witn tensor must great than or equal to 0")); - auto shape = ctx.Attr>("shape"); - paddle::framework::DeserializeFromStream( - fin, tensor, dev_ctx, seek, shape); - } else { - paddle::framework::DeserializeFromStream(fin, tensor, dev_ctx); - } - - auto load_as_fp16 = ctx.Attr("load_as_fp16"); - auto in_dtype = tensor->dtype(); - auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = - phi::KernelKey(place, phi::DataLayout::ALL_LAYOUT, in_dtype); - auto out_kernel_type = - phi::KernelKey(place, phi::DataLayout::ALL_LAYOUT, out_dtype); - phi::DenseTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType( - in_kernel_type, out_kernel_type, *tensor, &fp16_tensor); - - // reset output tensor - var->Clear(); - tensor = var->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); - } - } - - void LoadSelectedRows(std::istream &fin, - const platform::Place &place, - framework::Variable *var) const { - auto *selectedRows = var->GetMutable(); - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - load, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel); diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc deleted file mode 100644 index 0eb4ebe2442c1f7ca799a6e98b0940630b2c7373..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void LogLossAdds(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - float scale, - phi::DenseTensor* y) { - // Calculate y = x + scale - y->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scale}}); - runner.Run(stream); -} - -template -void LogLossMuls(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - float scale, - phi::DenseTensor* y) { - // Calculate y = x + scale - y->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scale}}); - runner.Run(stream); -} - -template -void LogLossBCE(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - z->mutable_data(x->dims(), place); - const auto& runner = - NpuOpRunner("BinaryCrossEntropy", - {*x, *y}, - {*z}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); -} - -template -void LogLossBCEGrad(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - const phi::DenseTensor* dout, - phi::DenseTensor* dx) { - dx->mutable_data(x->dims(), place); - const auto& runner = - NpuOpRunner("BinaryCrossEntropyGrad", - {*x, *y, *dout}, - {*dx}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); -} - -template -class LogLossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Output("Loss"); - auto* pred = ctx.Input("Predicted"); - auto* label = ctx.Input("Labels"); - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - float factor = 1 / (1 + 2 * epsilon); - float coef = std::log(factor); - LogLossAdds(place, stream, pred, epsilon, y); - LogLossMuls(place, stream, y, factor, y); - LogLossBCE(place, stream, y, label, y); - LogLossAdds(place, stream, y, coef, y); - } -}; - -template -class LogLossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* pred = ctx.Input("Predicted"); - auto* label = ctx.Input("Labels"); - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = - ctx.Output(framework::GradVarName("Predicted")); - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - if (dpred) { - LogLossBCEGrad(place, stream, pred, label, dloss, dpred); - LogLossMuls(place, stream, dpred, 1 / (1 + 2 * epsilon), dpred); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(log_loss, ops::LogLossNPUKernel); - -REGISTER_OP_NPU_KERNEL(log_loss_grad, ops::LogLossGradNPUKernel); diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc 
b/paddle/fluid/operators/log_softmax_op_npu.cc deleted file mode 100644 index 34f9c11e066a75fdc780dc1d87985882f94c56e5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class LogSoftmaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); - const int rank = X->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - Out->mutable_data(ctx.GetPlace()); - - if (X->numel() != 0) { - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner( - "LogSoftmaxV2", {*X}, {*Out}, {{"axes", std::vector{axis}}}); - runner.Run(stream); - } - } -}; - -template -class LogSoftmaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - const int rank = dOut->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(ctx.GetPlace()); - - if (dOut->numel() != 0) { - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("LogSoftmaxGrad", - {*dOut, *Out}, - {*dX}, - {{"axis", std::vector{axis}}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(log_softmax, - ops::LogSoftmaxNPUKernel, - ops::LogSoftmaxNPUKernel); - -REGISTER_OP_NPU_KERNEL(log_softmax_grad, - ops::LogSoftmaxGradNPUKernel, - ops::LogSoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc deleted file mode 100644 index 8ae050541fb2302212079df4dc532505a4f3812a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -constexpr int64_t kNoPadding = -1; - -template -class LookupTableV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids_t = ctx.Input("Ids"); // int tensor - auto *output_t = ctx.Output("Out"); // float tensor - auto *table_t = ctx.Input("W"); - - auto *table_var = ctx.InputVar("W"); - PADDLE_ENFORCE_EQ( - table_var->IsType(), - true, - platform::errors::InvalidArgument("npu only accept phi::DenseTensor")); - output_t->mutable_data(ctx.GetPlace()); - - int64_t padding_idx = ctx.Attr("padding_idx"); - if (padding_idx == kNoPadding) { - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) -#if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) -#endif - .AddOutput(*output_t); - runner.Run(); - } else { - phi::DenseTensor tmp_table_t(table_t->type()); - tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); - - phi::DenseTensor index; - index.mutable_data({1, 1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&index, - static_cast(padding_idx)); - - auto updata_dim = phi::make_ddim({1, table_t->dims()[1]}); - phi::DenseTensor update; - update.mutable_data(updata_dim, ctx.GetPlace()); - FillNpuTensorWithConstant(&update, static_cast(0)); - update.Resize(updata_dim); - - NpuOpRunner update_runner; - update_runner.SetType("TensorScatterUpdate") - .AddInput(*table_t) - .AddInput(index) - .AddInput(update) - .AddOutput(tmp_table_t); - update_runner.Run(); - - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(tmp_table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) -#if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) -#endif - .AddOutput(*output_t); - runner.Run(); - } - } -}; - -template -class LookupTableV2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids_t = ctx.Input("Ids"); - auto *output_grad_t = - ctx.Input(framework::GradVarName("Out")); - auto *table_grad_t = - ctx.Output(framework::GradVarName("W")); - table_grad_t->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - int64_t padding_idx = ctx.Attr("padding_idx"); - - /* EmbeddingDenseGrad has bug on large shape, temporarily disable it. - - int embedding_dim = table_grad_t->dims()[1]; - if (embedding_dim % 32 == 0) { - // NOTE(pangyoki): The embedding_dim of phi::DenseTensor used in - // EmbeddingDenseGrad must be an integer multiple of 32. - int num_weights = table_grad_t->dims()[0]; - const auto &runner = - NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, - {*table_grad_t}, {{"num_weights", num_weights}, - {"padding_idx", -1}, - {"scale_grad_by_freq", false}}); - runner.Run(stream); - return; - } - */ - - const auto &runner_zeros = - NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); - runner_zeros.Run(stream); - - if (padding_idx == kNoPadding) { - // NOTE(zhiqiu): It seems in cann 20.1, the first input and output - // can be different tensor, but in cann 20.2+, it does inplace operation. - // Thus, the first input and output should be same tensor. 
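// Illustrative note: after the ZerosLike above, ScatterAdd accumulates rows
// of the output gradient into the freshly zeroed table gradient, i.e. for
// every looked-up position j: table_grad[ids[j]] += output_grad[j], with
// use_locking requesting locked updates so duplicate ids accumulate safely.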
- const auto &runner_scatter = - NpuOpRunner("ScatterAdd", - {*table_grad_t, *ids_t, *output_grad_t}, - {*table_grad_t}, - {{"use_locking", true}}); - runner_scatter.Run(stream); - } else { - phi::DenseTensor casted_ids_t; - if (framework::TransToProtoVarType(ids_t->dtype()) != - framework::proto::VarType::INT32) { - casted_ids_t.mutable_data(ids_t->dims(), ctx.GetPlace()); - const auto &cast_runner = NpuOpRunner( - "Cast", {*ids_t}, {casted_ids_t}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(stream); - } else { - casted_ids_t.ShareDataWith(*ids_t); - } - auto table_grad_dims = table_grad_t->dims(); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(*output_grad_t) - .AddInput(casted_ids_t) - .AddInput(std::vector{table_grad_dims[0]}) - .AddOutput(*table_grad_t); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - lookup_table_v2, - ops::LookupTableV2NPUKernel, - ops::LookupTableV2NPUKernel, - ops::LookupTableV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - lookup_table_v2_grad, - ops::LookupTableV2GradNPUKernel, - ops::LookupTableV2GradNPUKernel, - ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/masked_select_op_npu.cc b/paddle/fluid/operators/masked_select_op_npu.cc deleted file mode 100644 index 96fba4b968869c327e46ce785eb32745705e875b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/masked_select_op_npu.cc +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MaskedSelectedNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto mask = ctx.Input("Mask"); - auto out = ctx.Output("Y"); - - auto input_dim = input->dims(); - auto mask_dim = mask->dims(); - PADDLE_ENFORCE_EQ( - input_dim, - mask_dim, - platform::errors::InvalidArgument( - "The dim size of input and mask in OP(masked_selected) " - "must be equal, but got input dim:(%ld), mask dim: " - "(%ld). 
Please check input " - "value.", - input_dim, - mask_dim)); - - auto& dev_ctx = - ctx.template device_context(); - auto stream = dev_ctx.stream(); - - Tensor mask_int32, out_size; - std::vector out_size_vec; - mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); - out_size.mutable_data({1}, ctx.GetPlace()); - { - const auto& cast_runner = NpuOpRunner( - "Cast", - {*mask}, - {mask_int32}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::proto::VarType::INT32))}}); - cast_runner.Run(stream); - - mask_int32.Resize({mask_int32.numel()}); - NpuOpRunner sum_runner; - sum_runner.SetType("ReduceSum"); - sum_runner.AddInput(mask_int32); - sum_runner.AddInput(std::vector({0})); - sum_runner.AddOutput(out_size); - sum_runner.AddAttr("keep_dims", false); - sum_runner.Run(stream); - paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); - } - - out->Resize({out_size_vec[0]}); - out->mutable_data(ctx.GetPlace()); - - Tensor topkv2_out, indices; - topkv2_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - indices.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - { - NpuOpRunner topkv2_runner; - topkv2_runner.SetType("TopKV2") - .AddInput(mask_int32) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", false) - .AddAttr("dim", 0) - .AddAttr("largest", true) - .Run(stream); - // TopKV2 may be unstable - NpuOpRunner topkv2_runner2; - topkv2_runner2.SetType("TopKV2") - .AddInput(indices) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", true) - .AddAttr("dim", 0) - .AddAttr("largest", false) - .Run(stream); - - Tensor input_tmp; - input_tmp.ShareDataWith(*input); - input_tmp.Resize({input->numel()}); - const auto& gather_runner = NpuOpRunner( - "GatherV2D", {input_tmp, topkv2_out}, {*out}, {{"axis", 0}}); - gather_runner.Run(stream); - } - } -}; - -template -class MaskedSelectedGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto mask = ctx.Input("Mask"); - auto y_grad = ctx.Input(framework::GradVarName("Y")); - auto x_grad = ctx.Output(framework::GradVarName("X")); - - x_grad->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = - ctx.template device_context(); - auto stream = dev_ctx.stream(); - - Tensor mask_int32, out_size; - std::vector out_size_vec; - mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); - out_size.mutable_data({1}, ctx.GetPlace()); - { - const auto& cast_runner = NpuOpRunner( - "Cast", - {*mask}, - {mask_int32}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::proto::VarType::INT32))}}); - cast_runner.Run(stream); - - mask_int32.Resize({mask_int32.numel()}); - NpuOpRunner sum_runner; - sum_runner.SetType("ReduceSum"); - sum_runner.AddInput(mask_int32); - sum_runner.AddInput(std::vector({0})); - sum_runner.AddOutput(out_size); - sum_runner.AddAttr("keep_dims", false); - sum_runner.Run(stream); - paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); - } - - Tensor topkv2_out, indices; - topkv2_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - indices.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - { - NpuOpRunner topkv2_runner; - topkv2_runner.SetType("TopKV2") - .AddInput(mask_int32) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", false) - .AddAttr("dim", 0) - .AddAttr("largest", true) - .Run(stream); - - NpuOpRunner topkv2_runner2; - topkv2_runner2.SetType("TopKV2") - .AddInput(indices) - .AddInput(out_size) - 
.AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", true) - .AddAttr("dim", 0) - .AddAttr("largest", false) - .Run(stream); - - topkv2_out.Resize({out_size_vec[0], 1}); - x_grad->Resize({x_grad->numel()}); - NpuOpRunner scatter_runner; - scatter_runner.SetType("ScatterNd"); - scatter_runner.AddInput(topkv2_out); - scatter_runner.AddInput(*y_grad); - scatter_runner.AddInput( - std::vector({static_cast(x_grad->numel())})); - scatter_runner.AddOutput(*x_grad); - scatter_runner.Run(stream); - x_grad->Resize(mask->dims()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(masked_select, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel); -REGISTER_OP_NPU_KERNEL(masked_select_grad, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel); diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc deleted file mode 100644 index d49d9a319ccffac180f87a2a7f6473b6e64bcc78..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ /dev/null @@ -1,561 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void Mul(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); - runner_dx.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); - runner_dx.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void Dot(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void MatMul2D(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = - NpuOpRunner("MatMul", - {X, Y}, - {*Out}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = - NpuOpRunner("MatMul", - {X, Y}, - {Out_temp}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = - NpuOpRunner("BatchMatMul", - {X, Y}, - {*Out}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = - NpuOpRunner("BatchMatMul", - {X, Y}, - {Out_temp}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const std::vector& dims, - const 
std::vector<int64_t>& brd_dims,
-                       const phi::DenseTensor& in,
-                       phi::DenseTensor* out) {
-  std::vector<int64_t> axes;
-  int64_t size = brd_dims.size();
-  int64_t diff = brd_dims.size() - dims.size();
-  for (int64_t i = 0; i < size; ++i) {
-    if (i < diff) {
-      axes.push_back(i);
-      continue;
-    }
-    if (brd_dims[i] > dims[i - diff]) {
-      axes.push_back(i);
-    }
-  }
-  out->mutable_data<T>(ctx.GetPlace());
-  const auto& runner = NpuOpRunner(
-      "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}});
-  runner.Run(stream);
-}
-
-template <typename T>
-class MatMulNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* X = ctx.Input<phi::DenseTensor>("X");
-    auto* Y = ctx.Input<phi::DenseTensor>("Y");
-    auto* Out = ctx.Output<phi::DenseTensor>("Out");
-    bool transpose_x = ctx.Attr<bool>("transpose_X");
-    bool transpose_y = ctx.Attr<bool>("transpose_Y");
-    float alpha = static_cast<float>(ctx.Attr<float>("alpha"));
-
-    std::vector<int64_t> x_dims = phi::vectorize(X->dims());
-    std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
-    std::vector<int64_t> out_dims = phi::vectorize(Out->dims());
-    int x_ndim = x_dims.size();
-    int y_ndim = y_dims.size();
-    int out_ndim = out_dims.size();
-
-    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
-
-    // Case 1: [K] x [K] = [1]
-    if (x_ndim == 1 && y_ndim == 1) {
-      PADDLE_ENFORCE_EQ(
-          X->numel(),
-          Y->numel(),
-          platform::errors::InvalidArgument(
-              "X's numel must be equal to Y's numel when both X and Y "
-              "are 1-D. But received X has [%d] elements and Y has [%d] "
-              "elements.",
-              X->numel(),
-              Y->numel()));
-      Out->Resize({1});
-      Dot<T>(ctx, stream, *X, *Y, Out, alpha);
-      return;
-    }
-
-    // Resize dim 1 to 2
-    phi::DenseTensor x_temp, y_temp;
-    x_temp.ShareDataWith(*X);
-    y_temp.ShareDataWith(*Y);
-    if (x_ndim == 1) {
-      x_dims.insert(x_dims.begin(), 1);
-      out_dims.insert(out_dims.end() - 1, 1);
-      x_temp.Resize(phi::make_ddim(x_dims));
-      x_ndim = 2;
-      out_ndim += 1;
-    }
-    if (y_ndim == 1) {
-      y_dims.push_back(1);
-      out_dims.push_back(1);
-      y_temp.Resize(phi::make_ddim(y_dims));
-      y_ndim = 2;
-      out_ndim += 1;
-    }
-
-    const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
-    if (transpose_y) {
-      PADDLE_ENFORCE_EQ(
-          y_dims[y_ndim - 1],
-          K,
-          platform::errors::InvalidArgument(
-              "Input(Y) has an invalid dim. "
-              "Y's dims[%d] must be equal to %d. "
-              "But received Y's dims[%d] is %d.",
-              y_ndim - 1,
-              K,
-              y_ndim - 1,
-              y_dims[y_ndim - 1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          y_dims[y_ndim - 2],
-          K,
-          platform::errors::InvalidArgument(
-              "Input(Y) has an invalid dim. "
-              "Y's dims[%d] must be equal to %d. "
-              "But received Y's dims[%d] is %d.",
-              y_ndim - 2,
-              K,
-              y_ndim - 2,
-              y_dims[y_ndim - 2]));
-    }
-
-    // Case 2: [M, K] x [K, N] = [M, N]
-    if (x_ndim == 2 && y_ndim == 2) {
-      MatMul2D<T>(
-          ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, alpha);
-      return;
-    }
-
-    // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false
-    // Equivalent: [B * M, K] x [K, N] = [B * M, N] => [B, M, N]
-    if (transpose_x == false && y_ndim == 2) {
-      std::vector<int64_t> vec_dim = {x_temp.numel() / K, K};
-      x_temp.Resize(phi::make_ddim(vec_dim));
-      MatMul2D<T>(
-          ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, alpha);
-      return;
-    }
-
-    // Case 4: [B, M, K] x [B, K, N] = [B, M, N]
-    std::vector<int64_t> x_broadcast_dims(out_ndim, 1);
-    std::vector<int64_t> y_broadcast_dims(out_ndim, 1);
-    std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin());
-    std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin());
-    std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2);
-    std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2);
-
-    phi::DenseTensor x_temp_brd(X->dtype());
-    if (x_dims == x_broadcast_dims) {
-      x_temp_brd.ShareDataWith(*X);
-      x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims));
-    } else {
-      x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims));
-      x_temp_brd.mutable_data<T>(ctx.GetPlace());
-      NpuOpRunner runner_brd;
-      runner_brd.SetType("BroadcastTo")
-          .AddInput(x_temp)
-          .AddInput(std::move(x_broadcast_dims))
-          .AddOutput(x_temp_brd)
-          .Run(stream);
-    }
-
-    phi::DenseTensor y_temp_brd(Y->dtype());
-    if (y_dims == y_broadcast_dims) {
-      y_temp_brd.ShareDataWith(*Y);
-      y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims));
-    } else {
-      y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims));
-      y_temp_brd.mutable_data<T>(ctx.GetPlace());
-      NpuOpRunner runner_brd;
-      runner_brd.SetType("BroadcastTo")
-          .AddInput(y_temp)
-          .AddInput(std::move(y_broadcast_dims))
-          .AddOutput(y_temp_brd)
-          .Run(stream);
-    }
-    MatMulND<T>(ctx,
-                stream,
-                x_temp_brd,
-                y_temp_brd,
-                Out,
-                transpose_x,
-                transpose_y,
-                alpha);
-  }
-};
-
-template <typename T>
-class MatMulGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* X = ctx.Input<phi::DenseTensor>("X");
-    auto* Y = ctx.Input<phi::DenseTensor>("Y");
-    auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    auto* dY = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
-    bool transpose_x = ctx.Attr<bool>("transpose_X");
-    bool transpose_y = ctx.Attr<bool>("transpose_Y");
-    float alpha = static_cast<float>(ctx.Attr<float>("alpha"));
-
-    std::vector<int64_t> x_dims = phi::vectorize(X->dims());
-    std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
-    std::vector<int64_t> out_dims = phi::vectorize(dOut->dims());
-    int x_ndim = x_dims.size();
-    int y_ndim = y_dims.size();
-    int out_ndim = out_dims.size();
-
-    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
-
-    // Case 1: [K] x [K] = [1]
-    if (x_ndim == 1 && y_ndim == 1) {
-      phi::DenseTensor dout_temp(dOut->dtype());
-      dout_temp.Resize(X->dims());
-      dout_temp.mutable_data<T>(ctx.GetPlace());
-      NpuOpRunner runner;
-      runner.SetType("BroadcastTo")
-          .AddInput(*dOut)
-          .AddInput(std::move(x_dims))
-          .AddOutput(dout_temp)
-          .Run(stream);
-
-      if (dX) {
-        Mul<T>(ctx, stream, dout_temp, *Y, dX, alpha);
-      }
-      if (dY) {
-        Mul<T>(ctx, stream, dout_temp, *X, dY, alpha);
-      }
-      return;
-    }
-
-    // Resize dim 1 to 2
-    phi::DenseTensor x_temp, y_temp, dout_temp;
-    x_temp.ShareDataWith(*X);
-    y_temp.ShareDataWith(*Y);
-    dout_temp.ShareDataWith(*dOut);
- if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - y_ndim = 2; - out_ndim += 1; - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (out_ndim == 2) { - if (dX) { - dX->Resize(phi::make_ddim(x_dims)); - if (transpose_x) { - MatMul2D( - ctx, stream, y_temp, dout_temp, dX, transpose_y, true, alpha); - } else { - MatMul2D( - ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, alpha); - } - dX->Resize(X->dims()); - } - if (dY) { - dY->Resize(phi::make_ddim(y_dims)); - if (transpose_y) { - MatMul2D( - ctx, stream, dout_temp, x_temp, dY, true, transpose_x, alpha); - } else { - MatMul2D( - ctx, stream, x_temp, dout_temp, dY, !transpose_x, false, alpha); - } - dY->Resize(Y->dims()); - } - return; - } - - const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - const int N = transpose_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (transpose_x == false && y_ndim == 2) { - std::vector x_vec_dim = {x_temp.numel() / K, K}; - dout_temp.Resize( - phi::make_ddim(std::vector{dout_temp.numel() / N, N})); - if (dX) { - dX->Resize(phi::make_ddim(x_vec_dim)); - MatMul2D( - ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, alpha); - dX->Resize(X->dims()); - } - if (dY) { - x_temp.Resize(phi::make_ddim(x_vec_dim)); - if (transpose_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false, alpha); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false, alpha); - } - } - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->dtype()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->dtype()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - - if (dX) { - if (x_dims == x_broadcast_dims) { - if (transpose_x) { - MatMulND( - ctx, stream, y_temp_brd, dout_temp, dX, transpose_y, true, alpha); - } else { - MatMulND(ctx, - stream, - dout_temp, - y_temp_brd, - dX, - false, - !transpose_y, - alpha); - } - } else { - phi::DenseTensor dx_temp(X->dtype()); - 
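-      // X was broadcast up to x_broadcast_dims, so the matmul product has
-      // the broadcast shape; accumulate it into dx_temp first, then
-      // ReduceSum the broadcast axes away (ReduceDims above) to recover
-      // a gradient with X's own shape.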
dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); - if (transpose_x) { - MatMulND(ctx, - stream, - y_temp_brd, - dout_temp, - &dx_temp, - transpose_y, - true, - alpha); - } else { - MatMulND(ctx, - stream, - dout_temp, - y_temp_brd, - &dx_temp, - false, - !transpose_y, - alpha); - } - ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); - } - } - if (dY) { - if (y_dims == y_broadcast_dims) { - if (transpose_y) { - MatMulND( - ctx, stream, dout_temp, x_temp_brd, dY, true, transpose_x, alpha); - } else { - MatMulND(ctx, - stream, - x_temp_brd, - dout_temp, - dY, - !transpose_x, - false, - alpha); - } - } else { - phi::DenseTensor dy_temp(Y->dtype()); - dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); - if (transpose_y) { - MatMulND(ctx, - stream, - dout_temp, - x_temp_brd, - &dy_temp, - true, - transpose_x, - alpha); - } else { - MatMulND(ctx, - stream, - x_temp_brd, - dout_temp, - &dy_temp, - !transpose_x, - false, - alpha); - } - ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - matmul, - ops::MatMulNPUKernel, - ops::MatMulNPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_grad, - ops::MatMulGradNPUKernel, - ops::MatMulGradNPUKernel); diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc deleted file mode 100644 index 2a398fbb5499bf42b9246b8eee909465be0d9901..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/operators/matmul_v2_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void MatMul2D(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("MatMul", - {X, Y}, - {*Out}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - runner.Run(stream); -} - -template -static void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("BatchMatMul", - {X, Y}, - {*Out}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); -} - -#if (CANN_VERSION_CODE < 504000) -template <> -void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - phi::DenseTensor x_fp32, y_fp32, out_fp32; - x_fp32.Resize(X.dims()); - y_fp32.Resize(Y.dims()); - out_fp32.Resize(Out->dims()); - x_fp32.mutable_data(ctx.GetPlace()); - y_fp32.mutable_data(ctx.GetPlace()); - out_fp32.mutable_data(ctx.GetPlace()); - - const auto& cast_x = - NpuOpRunner("Cast", - {X}, - {x_fp32}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(x_fp32.type())))}}); - cast_x.Run(stream); - const auto& cast_y = - NpuOpRunner("Cast", - {Y}, - {y_fp32}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(y_fp32.type())))}}); - cast_y.Run(stream); - - const auto& runner = NpuOpRunner("BatchMatMul", - {x_fp32, y_fp32}, - {out_fp32}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); - - const auto& cast_out = NpuOpRunner( - "Cast", - {out_fp32}, - {*Out}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::TransToProtoVarType(Out->type())))}}); - cast_out.Run(stream); -} -#endif - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const std::vector& dims, - const std::vector& brd_dims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t size = brd_dims.size(); - int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < size; ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (brd_dims[i] > dims[i - diff]) { - axes.push_back(i); - } - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class MatMulV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); - const bool trans_x = ctx.Attr("trans_x"); - const bool trans_y = ctx.Attr("trans_y"); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(Out->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = 
ctx.template device_context<NPUDeviceContext>().stream();
-
-    // Case 1: [K] x [K] = [1]
-    if (x_ndim == 1 && y_ndim == 1) {
-      PADDLE_ENFORCE_EQ(
-          X->numel(),
-          Y->numel(),
-          platform::errors::InvalidArgument(
-              "X's numel must be equal to Y's numel when both X and Y "
-              "are 1-D. But received X has [%d] elements and Y has [%d] "
-              "elements.",
-              X->numel(),
-              Y->numel()));
-      Out->Resize({1});
-      Out->mutable_data<T>(ctx.GetPlace());
-
-      const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out});
-      runner.Run(stream);
-      return;
-    }
-
-    // Resize dim 1 to 2
-    phi::DenseTensor x_temp, y_temp;
-    x_temp.ShareDataWith(*X);
-    y_temp.ShareDataWith(*Y);
-    if (x_ndim == 1) {
-      x_dims.insert(x_dims.begin(), 1);
-      out_dims.insert(out_dims.end() - 1, 1);
-      x_temp.Resize(phi::make_ddim(x_dims));
-      x_ndim = 2;
-      out_ndim += 1;
-    }
-    if (y_ndim == 1) {
-      y_dims.push_back(1);
-      out_dims.push_back(1);
-      y_temp.Resize(phi::make_ddim(y_dims));
-      y_ndim = 2;
-      out_ndim += 1;
-    }
-
-    const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
-    if (trans_y) {
-      PADDLE_ENFORCE_EQ(
-          y_dims[y_ndim - 1],
-          K,
-          platform::errors::InvalidArgument(
-              "Input(Y) has an invalid dim. "
-              "Y's dims[%d] must be equal to %d. "
-              "But received Y's dims[%d] is %d.",
-              y_ndim - 1,
-              K,
-              y_ndim - 1,
-              y_dims[y_ndim - 1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          y_dims[y_ndim - 2],
-          K,
-          platform::errors::InvalidArgument(
-              "Input(Y) has an invalid dim. "
-              "Y's dims[%d] must be equal to %d. "
-              "But received Y's dims[%d] is %d.",
-              y_ndim - 2,
-              K,
-              y_ndim - 2,
-              y_dims[y_ndim - 2]));
-    }
-
-    // Case 2: [M, K] x [K, N] = [M, N]
-    if (x_ndim == 2 && y_ndim == 2) {
-      MatMul2D<T>(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y);
-      return;
-    }
-
-    // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false
-    // Equivalent: [B * M, K] x [K, N] = [B * M, N] => [B, M, N]
-    if (trans_x == false && y_ndim == 2) {
-      std::vector<int64_t> vec_dim = {x_temp.numel() / K, K};
-      x_temp.Resize(phi::make_ddim(vec_dim));
-      MatMul2D<T>(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y);
-      return;
-    }
-
-    // Case 4: [B, M, K] x [B, K, N] = [B, M, N]
-    std::vector<int64_t> x_broadcast_dims(out_ndim, 1);
-    std::vector<int64_t> y_broadcast_dims(out_ndim, 1);
-    std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin());
-    std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin());
-    std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2);
-    std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2);
-
-    phi::DenseTensor x_temp_brd(X->type());
-    if (x_dims == x_broadcast_dims) {
-      x_temp_brd.ShareDataWith(*X);
-      x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims));
-    } else {
-      x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims));
-      x_temp_brd.mutable_data<T>(ctx.GetPlace());
-      NpuOpRunner runner_brd;
-      runner_brd.SetType("BroadcastTo")
-          .AddInput(x_temp)
-          .AddInput(std::move(x_broadcast_dims))
-          .AddOutput(x_temp_brd)
-          .Run(stream);
-    }
-
-    phi::DenseTensor y_temp_brd(Y->type());
-    if (y_dims == y_broadcast_dims) {
-      y_temp_brd.ShareDataWith(*Y);
-      y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims));
-    } else {
-      y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims));
-      y_temp_brd.mutable_data<T>(ctx.GetPlace());
-      NpuOpRunner runner_brd;
-      runner_brd.SetType("BroadcastTo")
-          .AddInput(y_temp)
-          .AddInput(std::move(y_broadcast_dims))
-          .AddOutput(y_temp_brd)
-          .Run(stream);
-    }
-    MatMulND<T>(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y);
-  }
-};
-
-template <typename T>
-class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const
framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - const bool trans_x = ctx.Attr("trans_x"); - const bool trans_y = ctx.Attr("trans_y"); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(dOut->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - phi::DenseTensor dout_temp(dOut->type()); - dout_temp.Resize(X->dims()); - dout_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(*dOut) - .AddInput(std::move(x_dims)) - .AddOutput(dout_temp) - .Run(stream); - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); - runner_dx.Run(stream); - } - if (dY) { - dY->mutable_data(ctx.GetPlace()); - const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); - runner_dy.Run(stream); - } - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp, dout_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - dout_temp.ShareDataWith(*dOut); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - y_ndim = 2; - out_ndim += 1; - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (out_ndim == 2) { - if (dX) { - dX->Resize(phi::make_ddim(x_dims)); - if (trans_x) { - MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); - } else { - MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); - } - dX->Resize(X->dims()); - } - if (dY) { - dY->Resize(phi::make_ddim(y_dims)); - if (trans_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); - } - dY->Resize(Y->dims()); - } - return; - } - - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (trans_x == false && y_ndim == 2) { - std::vector x_vec_dim = {x_temp.numel() / K, K}; - dout_temp.Resize( - phi::make_ddim(std::vector{dout_temp.numel() / N, N})); - if (dX) { - dX->Resize(phi::make_ddim(x_vec_dim)); - MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); - dX->Resize(X->dims()); - } - if (dY) { - x_temp.Resize(phi::make_ddim(x_vec_dim)); - if (trans_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); - } - } - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->type()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->type()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - - if (dX) { - if (x_dims == x_broadcast_dims) { - if (trans_x) { - MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); - } else { - MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); - } - } else { - phi::DenseTensor dx_temp(X->type()); - dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); - if (trans_x) { - MatMulND( - ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, true); - } else { - MatMulND( - ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, !trans_y); - } - ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); - } - } - if (dY) { - if (y_dims == y_broadcast_dims) { - if (trans_y) { - MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); - } else { - MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); - } - } else { - phi::DenseTensor dy_temp(Y->type()); - dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); - if (trans_y) { - MatMulND( - ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, trans_x); - } else { - MatMulND( - ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, false); - } - ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL(matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); diff --git a/paddle/fluid/operators/mean_op_npu.cc 
b/paddle/fluid/operators/mean_op_npu.cc deleted file mode 100644 index 3df6a6a04d54135663ef64aa55d5f8bf6719cb79..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mean_op_npu.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class MeanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - std::vector axes; - - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class MeanGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto stream = - context.template device_context() - .stream(); - - auto grad = context.Input(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - grad->numel(), - 1, - platform::errors::InvalidArgument( - "Mean Gradient Input phi::DenseTensor len should be 1. But " - "received Out@Grad's elements num is %d.", - grad->numel())); - - auto IG = context.Output(framework::GradVarName("X")); - IG->mutable_data(context.GetPlace()); - - // ones - phi::DenseTensor ones(grad->dtype()); - ones.mutable_data(IG->dims(), context.GetPlace()); - const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); - runner_ones.Run(stream); - - // means - phi::DenseTensor mean_tensor(grad->dtype()); - mean_tensor.Resize({1}); - mean_tensor.mutable_data(context.GetPlace()); - FillNpuTensorWithConstant( - &mean_tensor, static_cast(1.0 / static_cast(IG->numel()))); - - // means mul ones - phi::DenseTensor mean_ma(grad->dtype()); - mean_ma.Resize(IG->dims()); - mean_ma.mutable_data(context.GetPlace()); - const auto& runner_mul_1 = - NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {}); - runner_mul_1.Run(stream); - - // and mul grad - const auto& runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {}); - runner_mul_2.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - mean, - ops::MeanNPUKernel, - ops::MeanNPUKernel) - -REGISTER_OP_NPU_KERNEL( - mean_grad, - ops::MeanGradNPUKernel, - ops::MeanGradNPUKernel) diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc deleted file mode 100644 index e60af8bd480ea8112ce7944a2a8a45cb7da04ced..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/meshgrid_op_npu.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class MeshgridNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto ins = context.MultiInput<phi::DenseTensor>("X");
-    auto outs = context.MultiOutput<phi::DenseTensor>("Out");
-    PADDLE_ENFORCE_EQ(
-        (ins.size() > 1) && (ins.size() < 7),
-        true,
-        platform::errors::InvalidArgument(
-            "Expected the number of input tensors to be between 2 and 6, "
-            "but received %d.",
-            ins.size()));
-
-    int64_t size = ins.size();
-    std::vector<int64_t> shape(size);
-
-    for (int64_t i = 0; i < size; i++) {
-      switch (ins[i]->dims().size()) {
-        case 0:
-          shape[i] = 1;
-          break;
-        case 1:
-          shape[i] = ins[i]->dims()[0];
-          break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Expected a scalar or 1-D tensor in the tensor list, "
-              "but got tensor %d.",
-              i));
-      }
-    }
-
-    for (int64_t i = 0; i < size; i++) {
-      std::vector<int64_t> view_shape(size, 1);
-      view_shape[i] = shape[i];
-
-      framework::DDim out_dims_reshape = phi::make_ddim(view_shape);
-      phi::DenseTensor reshape_ins_tensor(ins[i]->dtype());
-      reshape_ins_tensor.ShareDataWith(*ins[i]);
-      reshape_ins_tensor.Resize(out_dims_reshape);
-
-      framework::DDim out_dims = phi::make_ddim(shape);
-      outs[i]->Resize(out_dims);
-      outs[i]->mutable_data<T>(context.GetPlace());
-
-      auto stream =
-          context.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      NpuOpRunner runner;
-      runner.SetType("BroadcastTo")
-          .AddInput(reshape_ins_tensor)
-          .AddInput(std::move(shape))
-          .AddOutput(*(outs[i]))
-          .Run(stream);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_NPU_KERNEL(
-    meshgrid,
-    paddle::operators::MeshgridNPUKernel<int>,
-#ifdef PADDLE_WITH_ASCEND_INT64
-    paddle::operators::MeshgridNPUKernel<int64_t>,
-#endif
-    paddle::operators::MeshgridNPUKernel<float>,
-    paddle::operators::MeshgridNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc
deleted file mode 100644
index d8b713de96fff4cd96376f374356c526cc82d23d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mul_op_npu.cc
+++ /dev/null
@@ -1,274 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class MulNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* y = ctx.Input<phi::DenseTensor>("Y");
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    int x_num_col_dims = ctx.Attr<int>("x_num_col_dims");
-    int y_num_col_dims = ctx.Attr<int>("y_num_col_dims");
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    if (x_num_col_dims == 1 && y_num_col_dims == 1) {
-      if (x->dims().size() == 2 && y->dims().size() == 2) {
-        out->mutable_data<T>(ctx.GetPlace());
-        const auto& runner =
-            NpuOpRunner("MatMul",
-                        {*x, *y},
-                        {*out},
-                        {{"transpose_x1", false}, {"transpose_x2", false}});
-
-        runner.Run(stream);
-      } else if (x->dims().size() >= 3 && y->dims().size() == 2) {
-        // reshape
-        Tensor tmp_x(x->type());
-        int64_t sec_dim = x->dims()[1];
-        for (auto i = 2; i < x->dims().size(); i++) {
-          sec_dim *= x->dims()[i];
-        }
-        int64_t first_dim = x->dims()[0];
-        tmp_x.ShareDataWith(*x);
-        tmp_x.Resize(phi::make_ddim({first_dim, sec_dim}));
-        out->mutable_data<T>(ctx.GetPlace());
-        // matmul
-        const auto& runner =
-            NpuOpRunner("MatMul",
-                        {tmp_x, *y},
-                        {*out},
-                        {{"transpose_x1", false}, {"transpose_x2", false}});
-        runner.Run(stream);
-      } else {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "The mul NPU kernel does not support the given input dims."));
-      }
-      // TODO: support other cases
-    } else if (x->dims().size() == 3 && y->dims().size() == 2) {
-      // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5]
-      PADDLE_ENFORCE_EQ(x_num_col_dims,
-                        2,
-                        platform::errors::InvalidArgument(
-                            "Only x_num_col_dims == 2 is supported for "
-                            "now, but got %d.",
-                            x_num_col_dims));
-      if (framework::TransToProtoVarType(x->dtype()) ==
-              framework::proto::VarType::FP16 &&
-          framework::TransToProtoVarType(y->dtype()) ==
-              framework::proto::VarType::FP16) {
-        // NOTE: When the dims of the input and output shapes are
-        // inconsistent, the (Broadcast) BatchMatMul NPU OP only supports
-        // FP16.
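-        // Example: x [2, 3, 4] (FP16) x y [4, 5] (FP16) runs directly as a
-        // broadcasted BatchMatMul and writes [2, 3, 5], without flattening
-        // x first as the non-FP16 branch below does.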
- out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("BatchMatMul", - {*x, *y}, - {*out}, - {{"adj_x1", false}, {"adj_x2", false}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } else { - // flatten => x.shape=[6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - - // matmul [6,4] , [4, 5] => [6, 5] - out->mutable_data(ctx.GetPlace()); - - Tensor tmp_out(x->type()); - tmp_out.ShareDataWith(*out); - tmp_out.Resize(phi::make_ddim({first_dim, y->dims()[1]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {tmp_x, *y}, - {tmp_out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner_matmul.Run(stream); - } - } - } -}; - -template -class MulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int x_num_col_dims = ctx.Attr("x_num_col_dims"); - int y_num_col_dims = ctx.Attr("y_num_col_dims"); - auto stream = - ctx.template device_context() - .stream(); - if (x_num_col_dims == 1 && y_num_col_dims == 1) { - if (x->dims().size() == 2 && y->dims().size() == 2) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", - {*dout, *y}, - {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - - runner_dx.Run(stream); - } - - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {*x, *dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); - } - } else if (x->dims().size() >= 3 && y->dims().size() == 2) { - // flatten => x.shape=[6, 4] - // matmul - if (dx) { - // matmul [2, 5] * [12, 5] => [2, 12] - dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx(x->type()); - tmp_dx.ShareDataWith(*dx); - tmp_dx.Resize(phi::make_ddim({dout->dims()[0], y->dims()[0]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {*dout, *y}, - {tmp_dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_matmul.Run(stream); - } - - if (dy) { - // flatten - Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1]; - for (auto i = 2; i < x->dims().size(); i++) { - sec_dim *= x->dims()[i]; - } - int64_t first_dim = x->dims()[0]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {tmp_x, *dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); - } - } - } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] - PADDLE_ENFORCE_EQ(x_num_col_dims, - 2, - platform::errors::InvalidArgument( - "now only support x_num_col_dims == 2: but got %d", - x_num_col_dims)); - // tmp_dout both used by dx and dy - Tensor tmp_dout(x->type()); - int64_t dout_first_dim = dout->dims()[0] * dout->dims()[1]; - int64_t dout_sec_dim = dout->dims()[2]; - tmp_dout.ShareDataWith(*dout); - tmp_dout.Resize(phi::make_ddim({dout_first_dim, dout_sec_dim})); - - if (dx) { - // tmp_dout * y [2, 3, 5] * [4,5] => [2, 3, 4] - if 
(framework::TransToProtoVarType(dout->dtype()) ==
-              framework::proto::VarType::FP16 &&
-          framework::TransToProtoVarType(y->dtype()) ==
-              framework::proto::VarType::FP16) {
-        // NOTE: When the dims of the input and output shapes are
-        // inconsistent, the (Broadcast) BatchMatMul NPU OP only supports
-        // FP16.
-        dx->mutable_data<T>(ctx.GetPlace());
-        const auto& runner =
-            NpuOpRunner("BatchMatMul",
-                        {*dout, *y},
-                        {*dx},
-                        {{"adj_x1", false}, {"adj_x2", true}});
-
-        auto stream =
-            ctx.template device_context<paddle::platform::NPUDeviceContext>()
-                .stream();
-        runner.Run(stream);
-      } else {
-        dx->mutable_data<T>(ctx.GetPlace());
-        Tensor tmp_dx(x->type());
-        tmp_dx.ShareDataWith(*dx);
-        tmp_dx.Resize(phi::make_ddim({dout_first_dim, y->dims()[0]}));
-
-        const auto& runner_matmul =
-            NpuOpRunner("MatMul",
-                        {tmp_dout, *y},
-                        {tmp_dx},
-                        {{"transpose_x1", false}, {"transpose_x2", true}});
-        runner_matmul.Run(stream);
-      }
-    }
-    if (dy) {
-      // flatten x.shape [2, 3, 4] => [6, 4]
-      Tensor tmp_x(x->type());
-      int64_t first_dim = x->dims()[0] * x->dims()[1];
-      int64_t sec_dim = x->dims()[2];
-      tmp_x.ShareDataWith(*x);
-      tmp_x.Resize(phi::make_ddim({first_dim, sec_dim}));
-      // matmul [6, 4]^T x [6, 5] => [4, 5]
-      dy->mutable_data<T>(ctx.GetPlace());
-      const auto& runner_dy =
-          NpuOpRunner("MatMul",
-                      {tmp_x, tmp_dout},
-                      {*dy},
-                      {{"transpose_x1", true}, {"transpose_x2", false}});
-      runner_dy.Run(stream);
-    }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    mul,
-    ops::MulNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MulNPUKernel<paddle::platform::NPUDeviceContext,
-                      paddle::platform::float16>);
-REGISTER_OP_NPU_KERNEL(
-    mul_grad,
-    ops::MulGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MulGradNPUKernel<paddle::platform::NPUDeviceContext,
-                          paddle::platform::float16>);
diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc
deleted file mode 100644
index 425b7c6738633d47b0b0ffcc396d788e8f709a5c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/multinomial_op_npu.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in
-// cmake/operators.cmake when Paddle supports
-#if (CANN_VERSION_CODE >= 504000)
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class NPUMultinomialKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto x = ctx.Input<phi::DenseTensor>("X");
-    auto out = ctx.Output<phi::DenseTensor>("Out");
-    const int64_t num_samples = ctx.Attr<int>("num_samples");
-    const bool replacement = ctx.Attr<bool>("replacement");
-
-    auto place = ctx.GetPlace();
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    out->mutable_data<T>(place);
-
-    const auto& runner = NpuOpRunner(
-        "MultinomialWithReplacementD",
-        {*x},
-        {*out},
-        {{"num_samples", num_samples}, {"replacement", replacement}});
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    multinomial,
-    ops::NPUMultinomialKernel<plat::NPUDeviceContext, float>,
-    ops::NPUMultinomialKernel<plat::NPUDeviceContext, double>)
-#endif
diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc
deleted file mode 100644
index b839b3e8ec2e0f196677269f33b2b0d161a6ca45..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/norm_op_npu.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using DDim = framework::DDim;
-
-void CheckAxis(int axis, int rank) {
-  // check that the axis is in [-rank, rank-1]
-  if (axis <= rank - 1 && axis >= -rank) return;
-  PADDLE_THROW(platform::errors::InvalidArgument(
-      "axis in the norm operator must be between (%d) and (%d), "
-      "but got (%d).",
-      -rank,
-      rank - 1,
-      axis));
-}
-
-template <typename DeviceContext, typename T>
-class NormNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    VLOG(4) << "Launch Norm Op Kernel on NPU."
<< std::endl; - auto *in_x = ctx.Input("X"); - auto *out_y = ctx.Output("Out"); - auto *out_norm = ctx.Output("Norm"); - out_y->mutable_data(ctx.GetPlace()); - out_norm->mutable_data(ctx.GetPlace()); - auto xdim = in_x->dims(); - float eps = ctx.Attr("epsilon"); - int axis = ctx.Attr("axis"); - CheckAxis(axis, xdim.size()); - if (axis < 0) axis = xdim.size() + axis; - - framework::NPUAttributeMap attr_input_norm; - attr_input_norm["axes"] = std::vector({axis}); - attr_input_norm["p"] = 2; - attr_input_norm["keepdim"] = true; - attr_input_norm["epsilon"] = eps; - const auto &runner = - NpuOpRunner("LpNorm", {*in_x}, {*out_norm}, attr_input_norm); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - NpuOpRunner("Div", {*in_x, *out_norm}, {*out_y}, {}).Run(stream); - } -}; - -template -class NormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - float epsilon = ctx.Attr("epsilon"); - int axis = ctx.Attr("axis"); - - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Out"); - auto *dy = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - - auto xdim = x->dims(); - CheckAxis(axis, xdim.size()); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - framework::NPUAttributeMap attr_input_norm; - attr_input_norm["dim"] = std::vector({axis}); - attr_input_norm["eps"] = epsilon; - const auto &runner = - NpuOpRunner("L2NormalizeGrad", {*x, *y, *dy}, {*dx}, attr_input_norm); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - norm, - ops::NormNPUKernel, - ops::NormNPUKernel) - -REGISTER_OP_NPU_KERNEL( - norm_grad, - ops::NormGradNPUKernel, - ops::NormGradNPUKernel); diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc deleted file mode 100644 index e44f6286afa9ba79a91c185da0c4d7c411d15d2e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/one_hot_op.h" - -namespace paddle { -namespace operators { - -template -class OneHotNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int depth = ctx.Attr("depth"); - - if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); - std::vector depth_data; - framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); - depth = depth_data[0]; - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - out->mutable_data(ctx.GetPlace()); - - float on_value = 1.0f, off_value = 0.0f; - if (framework::TransToProtoVarType(in->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(*in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_in; - transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(dev_ctx.stream()); - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(transformed_in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(one_hot, - ops::OneHotNPUKernel, - ops::OneHotNPUKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc deleted file mode 100644 index b213d3345d1f0c3bc5f583823b8fce23ac80b229..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class OneHotV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int depth = ctx.Attr("depth"); - - if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); - std::vector depth_data; - framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); - depth = depth_data[0]; - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - out->mutable_data(ctx.GetPlace()); - - float on_value = 1.0f, off_value = 0.0f; - if (framework::TransToProtoVarType(in->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(*in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_in; - transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(dev_ctx.stream()); - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(transformed_in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(one_hot_v2, - ops::OneHotV2NPUKernel, - ops::OneHotV2NPUKernel); diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc deleted file mode 100644 index c2d99fa42f2f8be73497b9947d75d05149c47b07..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/p_norm_op_npu.cc +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class PnormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_x = ctx.Input("X"); - auto* out_norm = ctx.Output("Out"); - out_norm->mutable_data(ctx.GetPlace()); - - float porder = ctx.Attr("porder"); - int axis = ctx.Attr("axis"); - bool keepdim = ctx.Attr("keepdim"); - - auto xdim = in_x->dims(); - if (axis < 0) axis = xdim.size() + axis; - - auto stream = - ctx.template device_context() - .stream(); - - int p = 0; - bool combine_op = - !(porder == 0 || porder == INFINITY || porder == -INFINITY); - if (porder == INFINITY) { - p = INT_MAX; - } else if (porder == -INFINITY) { - p = INT_MIN; - } else { - p = static_cast(porder); - float t = 0; - float diff = abs(std::modf(porder, &t)); - if (diff < 1e-5) { - combine_op = false; - } - } - - if (!combine_op) { - const auto& runner = NpuOpRunner("LpNorm", - {*in_x}, - {*out_norm}, - {{"p", p}, - {"axes", std::vector({axis})}, - {"keep_dims", keepdim}}); - runner.Run(stream); - } else { - phi::DenseTensor tmp_x; - tmp_x.mutable_data(xdim, ctx.GetPlace()); - - const auto& power_runner1 = - NpuOpRunner("Power", - {*in_x}, - {tmp_x}, - {{"power", porder}, {"scale", 1.0f}, {"shift", 0.0f}}); - power_runner1.Run(stream); - - const auto& reduce_runner = NpuOpRunner( - "ReduceSumD", - {tmp_x}, - {*out_norm}, - {{"axes", std::vector({axis})}, {"keep_dims", keepdim}}); - reduce_runner.Run(stream); - - const auto& power_runner2 = NpuOpRunner( - "Power", - {*out_norm}, - {*out_norm}, - {{"power", 1 / porder}, {"scale", 1.0f}, {"shift", 0.0f}}); - power_runner2.Run(stream); - } - } -}; - -template -class PnormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Out"); - auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - dx->mutable_data(place); - - auto xdim = x->dims(); - float porder = ctx.Attr("porder"); - bool keepdim = ctx.Attr("keepdim"); - - int axis = ctx.Attr("axis"); - axis = axis < 0 ? 
xdim.size() + axis : axis; - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor y_share(y->type()); - phi::DenseTensor dy_share(dy->type()); - y_share.ShareDataWith(*y); - dy_share.ShareDataWith(*dy); - auto ydim = xdim; - if (!keepdim) { - ydim[axis] = 1; - } else { - ydim = y->dims(); - } - y_share.Resize(ydim); - dy_share.Resize(ydim); - - if (porder == 0) { - FillNpuTensorWithConstant(dx, static_cast(0)); - dx->Resize(xdim); - } else if (porder == INFINITY || porder == -INFINITY) { - phi::DenseTensor x_abs; - x_abs.mutable_data(xdim, place); - const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); - r_abs.Run(stream); - - phi::DenseTensor t_cond; - t_cond.mutable_data(xdim, place); - const auto& r_equal = - NpuOpRunner("Equal", {x_abs, y_share}, {t_cond}, {}); - r_equal.Run(stream); - - phi::DenseTensor t_zero; - t_zero.mutable_data({1}, place); - FillNpuTensorWithConstant(&t_zero, static_cast(0)); - - phi::DenseTensor x_sign; - x_sign.mutable_data(xdim, place); - const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); - r_sign.Run(stream); - - const auto& r_mul = NpuOpRunner("Mul", {x_sign, dy_share}, {*dx}, {}); - r_mul.Run(stream); - - const auto& r_sel = - NpuOpRunner("SelectV2", {t_cond, *dx, t_zero}, {*dx}, {}); - r_sel.Run(stream); - } else { - phi::DenseTensor x_abs; - x_abs.mutable_data(xdim, place); - const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); - r_abs.Run(stream); - - phi::DenseTensor x_sign; - x_sign.mutable_data(xdim, place); - const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); - r_sign.Run(stream); - - phi::DenseTensor y_pow; - y_pow.mutable_data(ydim, place); - if (porder >= 1) { - const auto& r_pow1 = NpuOpRunner( - "Power", - {x_abs}, - {x_abs}, - {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow1.Run(stream); - - const auto& r_pow2 = NpuOpRunner( - "Power", - {y_share}, - {y_pow}, - {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow2.Run(stream); - - const auto& r_div = NpuOpRunner("DivNoNan", {x_abs, y_pow}, {*dx}, {}); - r_div.Run(stream); - } else { - const auto& r_pow1 = NpuOpRunner( - "Power", - {x_abs}, - {x_abs}, - {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow1.Run(stream); - - const auto& r_pow2 = NpuOpRunner( - "Power", - {y_share}, - {y_pow}, - {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow2.Run(stream); - - const auto& r_div = NpuOpRunner("DivNoNan", {y_pow, x_abs}, {*dx}, {}); - r_div.Run(stream); - } - - const auto& r_mul1 = NpuOpRunner("Mul", {*dx, x_sign}, {*dx}, {}); - r_mul1.Run(stream); - - const auto& r_mul2 = NpuOpRunner("Mul", {*dx, dy_share}, {*dx}, {}); - r_mul2.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - p_norm, - ops::PnormNPUKernel, - ops::PnormNPUKernel); - -REGISTER_OP_NPU_KERNEL( - p_norm_grad, - ops::PnormGradNPUKernel, - ops::PnormGradNPUKernel); diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc deleted file mode 100644 index 0f45d0b51c8373eae06a42936f666a022bdbc5b1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - paddle::framework::TensorToVector( - *paddings_t, context.device_context(), &paddings); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto in_dims = x->dims(); - - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - float value = context.Attr("value"); - auto data_format = context.Attr("data_format"); - - auto* out = context.Output("Out"); - - PADDLE_ENFORCE_LT(abs(value), - 1e-5, - platform::errors::Unimplemented( - "Ascend npu only support constant_values=0 right now," - "but received constant_value is %f .", - value)); - - PADDLE_ENFORCE_EQ(mode, - "constant", - platform::errors::Unimplemented( - "Ascend npu only support mode=constant right now," - "but received mode is %s .", - mode)); - - std::vector paddings( - {0, 0, 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1]}); - if (data_format == "NCDHW") { - out->Resize({in_dims[0], - in_dims[1], - in_dims[2] + pads[4] + pads[5], - in_dims[3] + pads[2] + pads[3], - in_dims[4] + pads[0] + pads[1]}); - } else { - out->Resize({in_dims[0], - in_dims[1] + pads[4] + pads[5], - in_dims[2] + pads[2] + pads[3], - in_dims[3] + pads[0] + pads[1], - in_dims[4]}); - paddings = { - 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1], 0, 0}; - } - out->mutable_data(context.GetPlace()); - - NpuOpRunner runner; - runner.SetType("PadV3") - .AddInput(*x) - .AddInput(std::move(paddings)) - .AddInput( - std::vector({0})) // npu only support constant_value=0 now - .AddOutput(*out) - .AddAttr("mode", mode); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class Pad3dGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - d_in->mutable_data(context.GetPlace()); - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - - auto stream = - context.template device_context() - .stream(); - - std::vector size( - {d_in_dims[0], d_in_dims[1], d_in_dims[2], d_in_dims[3], d_in_dims[4]}); - if (mode == "constant") { // this method can be only used for constant mode - std::vector offsets({0, 0, pad_front, pad_top, pad_left}); - if (data_format == "NDHWC") { - offsets = {0, pad_front, 
pad_top, pad_left, 0}; - } - const auto& runner = NpuOpRunner( - "SliceD", {*d_out}, {*d_in}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(pad3d, - ops::Pad3dNPUKernel, - ops::Pad3dNPUKernel, - ops::Pad3dNPUKernel); - -REGISTER_OP_NPU_KERNEL(pad3d_grad, - ops::Pad3dNPUKernel, - ops::Pad3dGradNPUKernel); diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc deleted file mode 100644 index 48c2254b1ec91eece67c18327500fb418d60a123..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_op_npu.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class PadNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto paddings = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - - PADDLE_ENFORCE_LT(abs(pad_value), - 1e-5, - platform::errors::Unimplemented( - "Ascend npu only support pad_value=0 right now," - "but received pad_value is %f .", - pad_value)); - - out->mutable_data(context.GetPlace()); - - NpuOpRunner runner; - runner.SetType("Pad") - .AddInput(*x) - .AddInput(std::move(paddings)) - .AddOutput(*out); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class PadGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto paddings = context.Attr>("paddings"); - - d_x->mutable_data(context.GetPlace()); - - auto d_x_dims = d_x->dims(); - auto size = phi::vectorize(d_x_dims); - std::vector offsets(0); - int i = 0; - for (auto iter = paddings.begin(); iter < paddings.end(); ++iter, ++i) { - if (i % 2 == 0) { - offsets.push_back(*iter); - } - } - - auto stream = - context.template device_context() - .stream(); - - const auto& runner = NpuOpRunner( - "SliceD", {*d_out}, {*d_x}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(pad, - ops::PadNPUKernel, - ops::PadNPUKernel, - ops::PadNPUKernel); - -REGISTER_OP_NPU_KERNEL(pad_grad, - ops::PadGradNPUKernel, - ops::PadGradNPUKernel); diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc deleted file mode 100644 index 
e14c55a63642a14b0dcc47d3eee27dbb3967b55d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_op_npu.cc +++ /dev/null @@ -1,334 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/pooling.h" - -namespace paddle { -namespace operators { - -template -class NPUPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - Tensor *out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - - bool global_pooling = ctx.Attr("global_pooling"); - bool ceil_mode = ctx.Attr("ceil_mode"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - auto in_x_dims = in_x->dims(); - auto out_dims = out->dims(); - framework::DDim data_dims; - framework::DDim out_data_dims; - - Tensor in_x_tensor, out_tensor; - in_x_tensor.ShareDataWith(*in_x); - out_tensor.ShareDataWith(*out); - std::vector ksize_vec(4, 1); - std::vector strides_vec(4, 1); - - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - out_data_dims = phi::slice_ddim(out_dims, 1, out_dims.size() - 1); - ksize_vec[1] = ksize[0]; - ksize_vec[2] = ksize[1]; - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - in_x_tensor.set_layout(DataLayout::kNHWC); - out_tensor.set_layout(DataLayout::kNHWC); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - out_data_dims = phi::slice_ddim(out_dims, 2, out_dims.size()); - ksize_vec[2] = ksize[0]; - ksize_vec[3] = ksize[1]; - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - } - phi::funcs::UpdatePadding(&paddings, - global_pooling, - adaptive, - padding_algorithm, - data_dims, - strides, - ksize); -#if (CANN_VERSION_CODE < 512000) - PADDLE_ENFORCE_LT( - std::max(paddings[0], paddings[1]), - ksize[0], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.", - ksize[0], - std::max(paddings[0], paddings[1]))); - PADDLE_ENFORCE_LT( - std::max(paddings[2], paddings[3]), - ksize[1], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.", - ksize[1], - std::max(paddings[2], paddings[3]))); -#endif - if (adaptive) { - std::string pooling_mode = "AdaptiveAvgPool2d"; - if (pooling_type == "max") { - pooling_mode = "AdaptiveMaxPool2d"; - } - - // AdaptiveAvgPool2d only support NCHW - Tensor transformed_input, transformed_output; - if 
(pooling_type == "avg" && channel_last) { - transformed_input.mutable_data( - phi::make_dim( - in_x_dims[0], in_x_dims[3], in_x_dims[1], in_x_dims[2]), - ctx.GetPlace()); - transformed_output.mutable_data( - phi::make_dim(out_dims[0], out_dims[3], out_dims[1], out_dims[2]), - ctx.GetPlace()); - - const auto &trans_runner = - NpuOpRunner("TransData", - {in_x_tensor}, - {transformed_input}, - {{"src_format", std::string("NHWC")}, - {"dst_format", std::string("NCHW")}}); - trans_runner.Run(dev_ctx.stream()); - } else { - transformed_input.ShareDataWith(in_x_tensor); - transformed_output.ShareDataWith(out_tensor); - } - - const auto &runner = - NpuOpRunner(pooling_mode, - {transformed_input}, - {transformed_output}, - {{"output_size", phi::vectorize(out_data_dims)}}); - runner.Run(dev_ctx.stream()); - - if (pooling_type == "avg" && channel_last) { - const auto &trans_runner = - NpuOpRunner("TransData", - {transformed_output}, - {out_tensor}, - {{"src_format", std::string("NCHW")}, - {"dst_format", std::string("NHWC")}}); - trans_runner.Run(dev_ctx.stream()); - } - } else { - std::string pooling_mode = "AvgPoolV2"; - if (pooling_type == "max") { - PADDLE_ENFORCE_EQ( - exclusive, - true, - platform::errors::InvalidArgument( - "MaxPool only support exclusive=false, but got true")); - pooling_mode = "MaxPoolV3"; - } - - const auto &runner = - NpuOpRunner(pooling_mode, - {in_x_tensor}, - {out_tensor}, - {{"ksize", ksize_vec}, - {"strides", strides_vec}, - {"padding_mode", std::string("CALCULATED")}, - {"pads", paddings}, - {"data_format", data_format}, - {"global_pooling", global_pooling}, - {"ceil_mode", ceil_mode}, - {"exclusive", exclusive}}); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class NPUPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - const Tensor *out = ctx.Input("Out"); - const Tensor *out_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *in_x_grad = - ctx.Output(framework::GradVarName("X")); - in_x_grad->mutable_data(ctx.GetPlace()); - - std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - bool ceil_mode = ctx.Attr("ceil_mode"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - // update paddings - auto in_x_dims = in_x->dims(); - auto out_dims = out->dims(); - framework::DDim data_dims; - framework::DDim out_data_dims; - std::vector ksize_vec(4, 1); - std::vector strides_vec(4, 1); - - Tensor in_x_tensor, out_tensor, out_grad_tensor, in_x_grad_tensor; - in_x_tensor.ShareDataWith(*in_x); - out_tensor.ShareDataWith(*out); - out_grad_tensor.ShareDataWith(*out_grad); - in_x_grad_tensor.ShareDataWith(*in_x_grad); - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - out_data_dims = phi::slice_ddim(out_dims, 1, out_dims.size() - 1); - ksize_vec[1] = ksize[0]; - ksize_vec[2] = ksize[1]; - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - in_x_tensor.set_layout(DataLayout::kNHWC); - out_tensor.set_layout(DataLayout::kNHWC); - 
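
[Editor's note] A side note on the forward pooling kernel above: per its own comment, AdaptiveAvgPool2d on Ascend only accepts NCHW, hence the TransData round trip for the avg + NHWC case, while Paddle's 2-D ksize/strides attributes are expanded into the rank-4 vectors the Ascend ops expect. A minimal sketch of that expansion (names are illustrative, not from the deleted file):

#include <array>
#include <cstdint>
#include <string>

// Expand {kh, kw} / {sh, sw} into a rank-4 vector, leaving N and C at 1.
// Layout decides where H and W sit: NHWC -> indices 1,2; NCHW -> indices 2,3.
std::array<int64_t, 4> ExpandPoolAttr(const std::array<int64_t, 2>& hw,
                                      const std::string& data_format) {
  std::array<int64_t, 4> v{1, 1, 1, 1};
  const int h = (data_format == "NHWC") ? 1 : 2;  // H position per layout
  v[h] = hw[0];      // kernel/stride height
  v[h + 1] = hw[1];  // kernel/stride width
  return v;
}
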
out_grad_tensor.set_layout(DataLayout::kNHWC); - in_x_grad_tensor.set_layout(DataLayout::kNHWC); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - out_data_dims = phi::slice_ddim(out_dims, 2, out_dims.size()); - ksize_vec[2] = ksize[0]; - ksize_vec[3] = ksize[1]; - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - } - phi::funcs::UpdatePadding(&paddings, - global_pooling, - adaptive, - padding_algorithm, - data_dims, - strides, - ksize); -#if (CANN_VERSION_CODE < 512000) - PADDLE_ENFORCE_LT( - std::max(paddings[0], paddings[1]), - ksize[0], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.", - ksize[0], - std::max(paddings[0], paddings[1]))); - PADDLE_ENFORCE_LT( - std::max(paddings[2], paddings[3]), - ksize[1], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.", - ksize[1], - std::max(paddings[2], paddings[3]))); -#endif - if (adaptive || (global_pooling && pooling_type == "max")) { - PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0], - 0, - platform::errors::InvalidArgument( - "When adaptive = True, H and W must be divisible, " - "but input dims is %s, output dims is %s", - data_dims, - out_data_dims)); - PADDLE_ENFORCE_EQ(data_dims[1] % out_data_dims[1], - 0, - platform::errors::InvalidArgument( - "When adaptive = True, H and W must be divisible, " - "but input dims is %s, output dims is %s", - data_dims, - out_data_dims)); - if (channel_last) { - strides_vec[1] = data_dims[0] / out_data_dims[0]; - strides_vec[2] = data_dims[1] / out_data_dims[1]; - ksize_vec[1] = strides_vec[1]; - ksize_vec[2] = strides_vec[2]; - } else { - strides_vec[2] = data_dims[0] / out_data_dims[0]; - strides_vec[3] = data_dims[1] / out_data_dims[1]; - ksize_vec[2] = strides_vec[2]; - ksize_vec[3] = strides_vec[3]; - } - } - - NPUAttributeMap attrs = {{"ksize", ksize_vec}, - {"strides", strides_vec}, - {"padding_mode", std::string("CALCULATED")}, - {"pads", paddings}, - {"data_format", data_format}, - {"global_pooling", global_pooling}, - {"ceil_mode", ceil_mode}, - {"exclusive", exclusive}}; - - if (pooling_type == "max") { - if (global_pooling) { - for (auto &s : strides_vec) { - s = 1; - } - PADDLE_ENFORCE_LT(std::max(data_dims[0], data_dims[1]), - 255, - platform::errors::InvalidArgument( - "MaxPoolGrad H, W must be less than 255 when " - "global_pooling = True, but got %s", - data_dims)); - attrs["global_pooling"] = false; - } - - const auto &runner = - NpuOpRunner("MaxPoolV3Grad", - {in_x_tensor, out_tensor, out_grad_tensor}, - {in_x_grad_tensor}, - attrs); // 0: floor, 1: ceil - runner.Run(dev_ctx.stream()); - } else if (pooling_type == "avg") { - PADDLE_ENFORCE(strides[0] == strides[1], - platform::errors::InvalidArgument( - "AvgPoolGrad dose not support Asymmetric strides. 
but " - "strides = (%d, %d)", - strides[0], - strides[1])); - - NpuOpRunner runner; - runner.SetType("AvgPoolV2Grad"); - runner.AddInput(phi::vectorize(in_x->dims())); - runner.AddInput(out_grad_tensor); - runner.AddOutput(in_x_grad_tensor); - runner.AddAttrs(attrs); - runner.Run(dev_ctx.stream()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(pool2d, - ops::NPUPoolOpKernel, - ops::NPUPoolOpKernel); -REGISTER_OP_NPU_KERNEL(pool2d_grad, - ops::NPUPoolGradOpKernel, - ops::NPUPoolGradOpKernel); diff --git a/paddle/fluid/operators/randperm_op_npu.cc b/paddle/fluid/operators/randperm_op_npu.cc deleted file mode 100644 index fd03ce027bda57a32b6e70206bfdfb216f5dc0ca..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/randperm_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/randperm_op.h" - -template -using kernel = - paddle::operators::RandpermKernel; - -REGISTER_OP_NPU_KERNEL( - randperm, kernel, kernel, kernel, kernel); diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc deleted file mode 100644 index b2266608d7dca30590e9e04f7929b0b3398fcf32..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/range_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/range_op.h" - -namespace paddle { -namespace operators { - -template -class RangeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* end_t = context.Input("End"); - auto* step_t = context.Input("Step"); - auto* out = context.Output("Out"); - - phi::DenseTensor n; - framework::TensorCopy( - *start_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T start = n.data()[0]; - framework::TensorCopy( - *end_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T end = n.data()[0]; - framework::TensorCopy( - *step_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T step = n.data()[0]; - - int64_t size = 0; - GetSize(start, end, step, &size); - - out->Resize(phi::make_ddim({size})); - out->mutable_data(context.GetPlace()); - - std::vector odata; - T value = start; - for (int64_t i = 0; i < size; ++i) { - odata.push_back(value); - value += step; - } - - framework::TensorFromVector(odata, context.device_context(), out); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL(range, - paddle::operators::RangeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::RangeNPUKernel, -#endif - paddle::operators::RangeNPUKernel, - paddle::operators::RangeNPUKernel) diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc deleted file mode 100644 index 068d5d6be12cd360def2a9139b9b112c6dd9d4b7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
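
[Editor's note] The deleted RangeNPUKernel is essentially host-side: it copies the Start/End/Step scalars to CPU, materializes the sequence in a std::vector, and ships it back to the device with TensorFromVector. A self-contained sketch, assuming GetSize follows the usual arange size rule:

#include <cmath>
#include <cstdint>
#include <vector>

template <typename T>
std::vector<T> MakeRange(T start, T end, T step) {
  // size = ceil(|end - start| / |step|), computed in double as an assumption.
  const int64_t size = static_cast<int64_t>(
      std::ceil(std::abs((end - start) / static_cast<double>(step))));
  std::vector<T> out;
  out.reserve(static_cast<size_t>(size > 0 ? size : 0));
  T value = start;
  for (int64_t i = 0; i < size; ++i) {
    out.push_back(value);  // value walks start, start + step, ...
    value += step;
  }
  return out;
}
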
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(range); -USE_OP_DEVICE_KERNEL(range, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto start = scope->Var("Start"); - auto tensor_start = start->GetMutable(); - std::vector init_start; - init_start.push_back(static_cast(1)); - paddle::framework::TensorFromVector(init_start, ctx, tensor_start); - tensor_start->Resize({1}); - - auto end = scope->Var("End"); - auto tensor_end = end->GetMutable(); - std::vector init_end; - init_end.push_back(static_cast(10)); - paddle::framework::TensorFromVector(init_end, ctx, tensor_end); - tensor_end->Resize({1}); - - auto step = scope->Var("Step"); - auto tensor_step = step->GetMutable(); - std::vector init_step; - init_step.push_back(static_cast(2)); - paddle::framework::TensorFromVector(init_step, ctx, tensor_step); - tensor_step->Resize({1}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - auto op = f::OpRegistry::CreateOp( - op_type, - {{"Start", {"Start"}}, {"End", {"End"}}, {"Step", {"Step"}}}, - {{"Out", {"Out"}}}, - {}); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - EXPECT_EQ(static_cast(out_vec.size()), static_cast(5)); - EXPECT_EQ(static_cast(out_vec[0]), static_cast(1.0)); - EXPECT_EQ(static_cast(out_vec[1]), static_cast(3.0)); - EXPECT_EQ(static_cast(out_vec[2]), static_cast(5.0)); - EXPECT_EQ(static_cast(out_vec[3]), static_cast(7.0)); - EXPECT_EQ(static_cast(out_vec[4]), static_cast(9.0)); -} - -TEST(range, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "range"); -} diff --git a/paddle/fluid/operators/reshape_op_npu.cc b/paddle/fluid/operators/reshape_op_npu.cc deleted file mode 100644 index 2d4497a19e77bbee5f6c3e83f552498351565947..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reshape_op_npu.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/tensor_utils.h" - -namespace paddle { -namespace operators { - -template -class Reshape2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto stream = - ctx.template device_context() - .stream(); - auto place = ctx.GetPlace(); - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - std::vector target_shape_vector; - auto shape_tensor_vector = ctx.MultiInput("ShapeTensor"); - if (shape_tensor_vector.size() > 0) { - for (auto* shape_tensor : shape_tensor_vector) { - PADDLE_ENFORCE_EQ( - shape_tensor->dims().size(), - 1, - platform::errors::InvalidArgument( - "If the element type of 'shape' in Reshape Op is Tensor, " - "the element's shape must be [1]. But received the element's " - "shape is [%d]", - shape_tensor->dims().size())); - - target_shape_vector.push_back( - phi::GetVectorFromTensor(shape_tensor)[0]); - } - } else { - auto* shape_tensor = ctx.HasInput("Shape") - ? ctx.Input("Shape") - : nullptr; - if (shape_tensor) { - target_shape_vector = phi::GetVectorFromTensor(shape_tensor); - } else { - target_shape_vector = ctx.Attr>("shape"); - PADDLE_ENFORCE_GT( - target_shape_vector.size(), - 0, - platform::errors::InvalidArgument( - "The length of shape attribute should be larger than 0 when " - "input ShapeTensor and Shape are empty!")); - } - } - - int num_negative = - std::count(target_shape_vector.begin(), target_shape_vector.end(), -1); - PADDLE_ENFORCE_LE( - num_negative, - 1, - platform::errors::InvalidArgument( - "The max number of -1 in shape attribute or shape tensor is 1 " - "but received %d.", - num_negative)); - auto it_zero = - std::find(target_shape_vector.begin(), target_shape_vector.end(), 0); - if (it_zero != target_shape_vector.end()) { - int x_rank = x->dims().size(); - for (size_t i = 0; i < target_shape_vector.size(); i++) { - if (target_shape_vector[i] == 0) { - PADDLE_ENFORCE_LT( - i, - x_rank, - platform::errors::InvalidArgument( - "The index of 0 in shape attribute or shape tensor", - "should be less than input dim size, ", - "but the index is %d and input dim size is %d", - i, - x_rank)); - target_shape_vector[i] = x->dims().at(i); - } - } - } - - auto it = - std::find(target_shape_vector.begin(), target_shape_vector.end(), -1); - if (it != target_shape_vector.end()) { - auto ddim_out_vec = phi::vectorize(x->dims()); - int ddim_out_product = std::accumulate( - ddim_out_vec.begin(), ddim_out_vec.end(), 1, std::multiplies()); - int reshape_out_product = std::accumulate(target_shape_vector.begin(), - target_shape_vector.end(), - -1, - std::multiplies()); - int index = std::distance(target_shape_vector.begin(), it); - target_shape_vector[index] = ddim_out_product / reshape_out_product; - } - - auto out_dims = phi::make_ddim(target_shape_vector); - out->mutable_data(out_dims, place); - - NpuOpRunner runner; - // the shape input must be on the host side - runner.SetType("Reshape") - .AddInput(*x) - .AddInput(std::vector(target_shape_vector)) - .AddOutput(*out) - .AddAttr("axis", 0) - .AddAttr("num_axes", -1); - runner.Run(stream); - } -}; - -template -class Reshape2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto in_dims = d_x->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); 
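
[Editor's note] The bulk of the deleted Reshape2NPUKernel is target-shape inference: a 0 entry copies the matching input dimension and a single -1 entry is solved from the element count. A standalone sketch of that inference, with error handling reduced to a throw:

#include <cstdint>
#include <functional>
#include <numeric>
#include <stdexcept>
#include <vector>

std::vector<int> InferReshape(const std::vector<int64_t>& in_dims,
                              std::vector<int> target) {
  const int64_t in_numel = std::accumulate(
      in_dims.begin(), in_dims.end(), int64_t{1}, std::multiplies<int64_t>());
  int64_t known = 1;   // product of all resolved target dims
  int neg_index = -1;  // position of the single allowed -1
  for (size_t i = 0; i < target.size(); ++i) {
    if (target[i] == 0) target[i] = static_cast<int>(in_dims.at(i));
    if (target[i] == -1) {
      if (neg_index >= 0) throw std::invalid_argument("at most one -1");
      neg_index = static_cast<int>(i);
    } else {
      known *= target[i];
    }
  }
  if (neg_index >= 0) target[neg_index] = static_cast<int>(in_numel / known);
  return target;
}

The grad kernel needs no device op at all: reshape is layout-preserving, so it is a TensorCopy followed by a Resize back to the input dims.
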
- framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(in_dims); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - reshape2, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel); -REGISTER_OP_NPU_KERNEL( - reshape2_grad, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel); diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc deleted file mode 100644 index 7d15dc2a46558c94f80e1b352e4b8e6cdf8f0398..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class ROIAlignNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); // (B,C,H,W) - auto* ROIs = ctx.Input("ROIs"); // (N,4) - auto* ROIsNum = ctx.Input("RoisNum"); // [0 1 1 2 2 2] - auto* Out = ctx.Output("Out"); - Out->mutable_data(ctx.GetPlace()); - - auto spatial_scale = ctx.Attr("spatial_scale"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto sample_num = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - auto roi_end_mode = 0; - PADDLE_ENFORCE_EQ( - aligned, - false, - platform::errors::InvalidArgument( - "ROIAlignNPU only support Aligned attribute equaled to False")); - - framework::NPUAttributeMap attr_roi = {{"spatial_scale", spatial_scale}, - {"pooled_height", pooled_height}, - {"pooled_width", pooled_width}, - {"sample_num", sample_num}, - {"roi_end_mode", roi_end_mode}}; - - auto stream = - ctx.template device_context() - .stream(); - - // Combine *ROIsNum with ROIs to get new ROIs - // change roisnum's datatype & resize - int dtype = - static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); - framework::NPUAttributeMap attr_cast = {{"dst_type", dtype}}; - phi::DenseTensor ROIsNum_fp(ROIs->dtype()); - ROIsNum_fp.Resize(phi::make_ddim({ROIs->dims()[0], 1})); - ROIsNum_fp.mutable_data(ctx.GetPlace()); - - const auto& runner_c = - NpuOpRunner("Cast", {*ROIsNum}, {ROIsNum_fp}, attr_cast); - runner_c.Run(stream); - - // concate to make (N, 5) - std::vector x_list; - x_list.push_back(ROIsNum_fp); - x_list.push_back(*ROIs); - auto axis = 1; - // output of concate - phi::DenseTensor ROIs_N5(ROIs->dtype()); - ROIs_N5.Resize(phi::make_ddim({ROIs->dims()[0], 5})); - 
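
[Editor's note] The roi_align forward kernel here is building the (N, 5) ROI tensor the Ascend ROIAlign op expects: RoisNum (one batch index per ROI, per the [0 1 1 2 2 2] comment) is cast to fp32 and concatenated as column 0, in front of the four box coordinates. A host-side sketch of the resulting layout (names are illustrative):

#include <cstddef>
#include <vector>

std::vector<float> PackRois(const std::vector<int>& batch_index,  // size N
                            const std::vector<float>& boxes) {    // size 4 * N
  std::vector<float> rois_n5;
  rois_n5.reserve(batch_index.size() * 5);
  for (std::size_t i = 0; i < batch_index.size(); ++i) {
    rois_n5.push_back(static_cast<float>(batch_index[i]));  // column 0
    for (int k = 0; k < 4; ++k) rois_n5.push_back(boxes[4 * i + k]);
  }
  return rois_n5;  // row i = {batch, x1, y1, x2, y2}
}
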
ROIs_N5.mutable_data(ctx.GetPlace()); - - // attribute of concate - auto EleNum = 2; - framework::NPUAttributeMap attr_concat = {{"N", EleNum}, - {"concat_dim", axis}}; - - NpuOpRunner runner0; - runner0.SetType("ConcatD") - .AddInputs(x_list) - .AddOutput(ROIs_N5) - .AddInputNames({"x0", "x1"}) - .AddAttrs(attr_concat); - runner0.Run(stream); - - const auto& runner = - NpuOpRunner("ROIAlign", {*X, ROIs_N5}, {*Out}, attr_roi); - runner.Run(stream); - } -}; - -template -class ROIAlignNPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sample_num = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int rois_num = rois->dims()[0]; - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - if (!in_grad) { - return; - } - in_grad->mutable_data(place); - - PADDLE_ENFORCE_EQ( - aligned, - false, - platform::errors::InvalidArgument( - "ROIAlignGradNPU only support Aligned attribute equaled to False")); - PADDLE_ENFORCE_EQ( - ctx.HasInput("RoisNum"), - true, - platform::errors::NotFound("Input(RoisNum) of ROIAlignGradOp " - "is not found while using NPU.")); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rois->dtype()), - framework::proto::VarType::FP32, - platform::errors::InvalidArgument( - "ROIAlignGradNPU only support ROIs type equaled to FP32.")); - - // Cast RoisNum to fp32 tensor - auto* RoisNum = ctx.Input("RoisNum"); - phi::DenseTensor ROIs_N5; - ROIs_N5.mutable_data({rois_num, 5}, place); - phi::DenseTensor ROIsNum_fp; - ROIsNum_fp.mutable_data(RoisNum->dims(), place); // shape = [rois_num] - int nputype_fp32 = - static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); - const auto& runner_cast = NpuOpRunner( - "Cast", {*RoisNum}, {ROIsNum_fp}, {{"dst_type", nputype_fp32}}); - runner_cast.Run(stream); - ROIsNum_fp.Resize({rois_num, 1}); - - // Combine *ROIsNum with ROIs to get new ROIs - std::vector x_list; - x_list.push_back(ROIsNum_fp); - x_list.push_back(*rois); - const auto& runner_concat = NpuOpRunner( - "ConcatD", {x_list}, {ROIs_N5}, {{"N", 2}, {"concat_dim", 1}}); - runner_concat.Run(stream); - - // If CANN version code is less than 504, by analysis, in order to match - // cpu grad version, rois[:,3:5] should substrate 1 before call ascend grad - // function -#if (CANN_VERSION_CODE < 504000) - std::vector vec_dlt = {0, 0, 0, -1.0f, -1.0f}; - phi::DenseTensor tsr_dlt; - tsr_dlt.mutable_data({5}, place); - framework::TensorFromVector(vec_dlt, ctx.device_context(), &tsr_dlt); - ctx.template device_context().Wait(); - const auto& runner_add = - NpuOpRunner("AddV2", {ROIs_N5, tsr_dlt}, {ROIs_N5}, {}); - runner_add.Run(stream); -#endif - - // Call ascend RoiAlignGrad function - int roi_end_mode = 0; - const auto& runner_roi_align_grad = - NpuOpRunner("ROIAlignGrad", - {*out_grad, ROIs_N5}, - {*in_grad}, - {{"xdiff_shape", phi::vectorize(in_dims)}, - {"pooled_width", pooled_width}, - {"pooled_height", pooled_height}, - {"spatial_scale", spatial_scale}, - {"sample_num", sample_num}, - {"roi_end_mode", roi_end_mode}}); - runner_roi_align_grad.Run(stream); - } -}; - -} // namespace 
operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - roi_align, - ops::ROIAlignNPUKernel, - ops::ROIAlignNPUKernel, - ops::ROIAlignNPUKernel); - -REGISTER_OP_NPU_KERNEL(roi_align_grad, - ops::ROIAlignNPUGradKernel, - ops::ROIAlignNPUGradKernel, - ops::ROIAlignNPUGradKernel); diff --git a/paddle/fluid/operators/run_program_op_npu.cc b/paddle/fluid/operators/run_program_op_npu.cc deleted file mode 100644 index e45ce0a2bef9ffee7b36e76570059569258375ef..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/run_program_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/sampling_id_op_npu.cc b/paddle/fluid/operators/sampling_id_op_npu.cc deleted file mode 100644 index 5657edcfa35bb32e4eb44f4813ff11f88d9bd71a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sampling_id_op_npu.cc +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sampling_id_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(sampling_id, - paddle::operators::SamplingIdKernel, - paddle::operators::SamplingIdKernel); diff --git a/paddle/fluid/operators/save_combine_op_npu.cc b/paddle/fluid/operators/save_combine_op_npu.cc deleted file mode 100644 index 1fb136a5110dbd512f8d3da323873a0e68d6edfc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/save_combine_op_npu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
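
[Editor's note] One subtlety in the deleted roi_align_grad kernel above: for CANN versions below 5.0.4 it subtracts 1 from the x2/y2 columns of the packed ROIs (a single AddV2 with the broadcast vector {0, 0, 0, -1, -1}) so the device gradient matches the CPU reference. A scalar sketch of that adjustment:

#include <cstddef>
#include <vector>

// rois_n5 is the flattened (N, 5) tensor of {batch, x1, y1, x2, y2} rows.
void AdjustRoisForOldCann(std::vector<float>* rois_n5) {
  static const float delta[5] = {0.f, 0.f, 0.f, -1.f, -1.f};
  for (std::size_t i = 0; i < rois_n5->size(); ++i) {
    (*rois_n5)[i] += delta[i % 5];  // shift only x2/y2 of each row
  }
}
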
*/ - -#include "paddle/fluid/operators/save_combine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - save_combine, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op_npu.cc b/paddle/fluid/operators/save_op_npu.cc deleted file mode 100644 index d6063d66f1531c17a526145085fe857a78ae4040..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/save_op_npu.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/save_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc deleted file mode 100644 index c25a49c4f3b6002b9e00844c256a2259ec9026de..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scale_op_npu.cc +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -static inline T GetAttrFromTensor(const phi::DenseTensor* tensor) { - const auto* tensor_data = tensor->data(); - phi::DenseTensor cpu_tensor; - if (platform::is_gpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - paddle::framework::TensorCopySync( - *tensor, platform::CPUPlace(), &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - return tensor_data[0]; -} - -template -class ScaleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto scale = ctx.Attr("scale"); - auto bias = ctx.Attr("bias"); - auto bias_after_scale = ctx.Attr("bias_after_scale"); - auto stream = - ctx.template device_context() - .stream(); - float power = 1.0; - VLOG(4) << "scale:" << scale << ", bias:" << bias - << " ,bias_after_scale:" << bias_after_scale; - if (ctx.HasInput("ScaleTensor")) { - auto* scale_tensor = ctx.Input("ScaleTensor"); - scale = static_cast(GetAttrFromTensor(scale_tensor)); - } - if (isinf(scale)) { - if (signbit(scale)) { - scale = -std::numeric_limits::max(); - } else { - scale = std::numeric_limits::max(); - } - } - if (!bias_after_scale) { - bias *= scale; - } - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attrs = { - {"power", power}, {"scale", scale}, {"shift", bias}}; - const auto& dev_ctx = - ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& muls_runner = NpuOpRunner( - "Muls", {inputs[0]}, {outputs[0]}, {{"value", attrs.at("scale")}}); - muls_runner.Run(dev_ctx.stream()); - - const auto& adds_runner = NpuOpRunner( - "Adds", {outputs[0]}, {outputs[0]}, {{"value", attrs.at("shift")}}); - adds_runner.Run(dev_ctx.stream()); - }; - - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attrs, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT64) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attrs, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = NpuOpRunner("Power", {*x}, {*out}, attrs); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - scale, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel); diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc deleted file mode 100644 index b2b09faaa9d44590cb0fad49812d1d4e2662da90..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
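
[Editor's note] The ScaleNPUKernel above folds both bias conventions into a single affine form, which is why it rewrites bias *= scale up front and then issues one Power call with power = 1 (or a Muls + Adds pair for integer types):

\[
\text{out} =
\begin{cases}
\text{scale}\cdot x + \text{bias}, & \text{bias\_after\_scale},\\
\text{scale}\cdot(x + \text{bias}) = \text{scale}\cdot x + \text{scale}\cdot\text{bias}, & \text{otherwise.}
\end{cases}
\]
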
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/seed_op_npu.cc b/paddle/fluid/operators/seed_op_npu.cc deleted file mode 100644 index 1843e993d552a1e44058d3390b73b989e96c2841..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/seed_op_npu.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/seed_op.h" - -namespace paddle { -namespace operators { - -template -class NPUSeedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Output("Out"); - int user_seed = ctx.Attr("seed"); - std::random_device rnd; - int seed; - - if (user_seed != 0) { - seed = user_seed; - } else { - seed = rnd(); - } - - out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(out, seed); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - seed, ops::NPUSeedKernel); diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc deleted file mode 100644 index b572e98eb81e9fffd6cf638448473330c30c7a71..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/set_value_op.h" -#include "paddle/phi/kernels/funcs/slice_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class SetValueNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = - phi::funcs::GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = - phi::funcs::GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - paddle::framework::TensorCopy(*in, ctx.GetPlace(), out); - - auto starts_indices = std::vector(in_dims.size(), 0); - auto ends_indices = std::vector(in_dims.size(), 0); - auto strides_indices = std::vector(in_dims.size(), 0); - - for (int i = 0; i < in_dims.size(); ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - } - - int64_t stride_step = phi::product(in_dims); - std::vector index_indices(1, 0); - for (size_t i = 0; i < strides_indices.size(); ++i) { - auto index_size = index_indices.size(); - stride_step /= in_dims[i]; - for (size_t j = 0; j < index_size; ++j) { - auto start_index = *index_indices.begin(); - if (strides_indices[i] > 0) { - for (int64_t k = starts_indices[i]; k < ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } else { - for (int64_t k = starts_indices[i]; k > ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } - index_indices.erase(index_indices.begin()); - } - } - - PADDLE_ENFORCE_EQ( - static_cast(index_indices.size()), - 
phi::product(slice_dims_for_assign), - platform::errors::InvalidArgument( - "OP(set_value) error index indices and value update not match ")); - - phi::DenseTensor value_t(in->type()); - if (value_tensor != nullptr) { - value_t.ShareDataWith(*value_tensor); - } else { - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, ctx.GetPlace()); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVectorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - } - - auto stream = ctx.template device_context().stream(); - - phi::DenseTensor value_temp(in->type()); - if (slice_dims_for_assign == value_t.dims()) { - value_temp.ShareDataWith(value_t); - } else { - value_temp.Resize(slice_dims_for_assign); - value_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(value_t) - .AddInput(phi::vectorize(slice_dims_for_assign)) - .AddOutput(value_temp) - .Run(stream); - } - - int64_t input_numel = phi::product(in_dims); - int64_t index_numel = index_indices.size(); - - phi::DenseTensor in_temp, out_temp, val_temp; - in_temp.ShareDataWith(*in); - out_temp.ShareDataWith(*out); - val_temp.ShareDataWith(value_temp); - in_temp.Resize(phi::make_ddim({input_numel})); - out_temp.Resize(phi::make_ddim({input_numel})); - val_temp.Resize(phi::make_ddim({index_numel})); - - NpuOpRunner runner; - runner.SetType("ScatterUpdate") - .AddInput(in_temp) - .AddInput(std::move(index_indices)) - .AddInput(val_temp) - .AddOutput(out_temp) -#if (CANN_VERSION_CODE >= 504000) - .AddAttrs({{"use_locking", false}}) -#endif - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(set_value, - ops::SetValueNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::SetValueNPUKernel, -#endif - ops::SetValueNPUKernel) diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc deleted file mode 100644 index 76f4539e70b2f7e8982bb33d6035d779d315f2b1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shape_op_npu.cc +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ShapeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out_t = ctx.Output("Out"); - out_t->Resize({x->dims().size()}); - out_t->mutable_data(ctx.GetPlace()); - - // The output data type defaults to int32. 
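
[Editor's note] The index loop in the SetValueNPUKernel above lowers an n-D strided slice to the flat row-major indices it touches, so the assignment can run as a 1-D ScatterUpdate over the flattened tensor. A standalone sketch of the same enumeration (the deleted kernel does it in place with push_back/erase):

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<int64_t> SliceToFlatIndices(const std::vector<int64_t>& dims,
                                        const std::vector<int64_t>& starts,
                                        const std::vector<int64_t>& ends,
                                        const std::vector<int64_t>& strides) {
  std::vector<int64_t> indices{0};  // start from the flat origin
  int64_t stride_step = 1;
  for (int64_t d : dims) stride_step *= d;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    stride_step /= dims[i];  // flat distance of one step along axis i
    std::vector<int64_t> next;
    for (int64_t base : indices) {
      if (strides[i] > 0) {
        for (int64_t k = starts[i]; k < ends[i]; k += strides[i])
          next.push_back(base + k * stride_step);
      } else {
        for (int64_t k = starts[i]; k > ends[i]; k += strides[i])
          next.push_back(base + k * stride_step);
      }
    }
    indices = std::move(next);
  }
  return indices;
}
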
- auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - runner.SetType("Shape").AddInput(*x).AddOutput(*out_t).AddAttr( - "dtype", static_cast(dst_dtype)); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - shape, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel); diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc deleted file mode 100644 index 4181db1d8e04cd94e3d1b3021eba0fab59cff028..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ShardIndexNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - VLOG(4) << "start kernel"; - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - - PADDLE_ENFORCE_GT( - index_num, - 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, - 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, - 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, - nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, - shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - auto place = context.GetPlace(); - out->Resize(in->dims()); - out->set_lod(in->lod()); - out->mutable_data(place); - - phi::DenseTensor tmp(in->type()); - tmp.mutable_data(framework::DDim({1}), place); - FillNpuTensorWithConstant(&tmp, shard_size); - - phi::DenseTensor condition(phi::DataType::BOOL); - condition.mutable_data(in->dims(), place); - - phi::DenseTensor tmp2(in->type()); - tmp2.mutable_data(in->dims(), place); - - phi::DenseTensor tmp3(in->type()); - tmp3.mutable_data(in->dims(), place); - - auto stream = - context.template device_context() - .stream(); - - NpuOpRunner runner; - runner.AddInputs({*in, tmp}); - 
runner.AddOutputs({tmp2}); - runner.SetType("Mod"); - runner.Run(stream); - - NpuOpRunner runner1; - runner1.AddInputs({*in, tmp}); - runner1.AddOutputs({tmp3}); - runner1.SetType("FloorDiv"); - runner1.Run(stream); - - FillNpuTensorWithConstant(&tmp, shard_id); - NpuOpRunner runner2; - runner2.AddInputs({tmp3, tmp}); - runner2.AddOutputs({condition}); - runner2.SetType("Equal"); - runner2.Run(stream); - - phi::DenseTensor tmp4(in->type()); - tmp4.mutable_data(in->dims(), place); - FillNpuTensorWithConstant(&tmp4, ignore_value); - tmp4.Resize(in->dims()); - - NpuOpRunner runner3; - runner3.AddInputs({condition, tmp2, tmp4}); - runner3.AddOutputs({*out}); - runner3.SetType("Select"); - runner3.Run(stream); - } -}; -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(shard_index, - ops::ShardIndexNPUKernel, - ops::ShardIndexNPUKernel); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc deleted file mode 100644 index 0d4ad6331e80708eec2b920b5193276fef4b4d04..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
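
[Editor's note] The Mod / FloorDiv / Equal / Select pipeline in the ShardIndexNPUKernel above implements a simple per-element rule. A scalar sketch:

// shard_size = ceil(index_num / nshards); entries owned by this shard keep
// their local offset, everything else is masked with ignore_value.
int ShardIndex(int x, int index_num, int nshards, int shard_id,
               int ignore_value) {
  const int shard_size = (index_num + nshards - 1) / nshards;
  return (x / shard_size == shard_id) ? (x % shard_size) : ignore_value;
}
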
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -const int kIgnoreIndex = -100; - -void CheckAttrs(const framework::ExecutionContext& ctx) { - // Add this check is due to Ascend SigmoidCrossEntropyWithLogits - // and SigmoidCrossEntropyWithLogitsGrad does't supoort - // attr normalize and ignore_index - bool normalize = ctx.Attr("normalize"); - int ignore_index = ctx.Attr("ignore_index"); - PADDLE_ENFORCE_EQ(normalize, - false, - platform::errors::InvalidArgument( - "attr normalize must be false, but got true")); - PADDLE_ENFORCE_EQ(ignore_index, - kIgnoreIndex, - platform::errors::InvalidArgument( - "attr ignore_index must be default %d, but got %d", - kIgnoreIndex, - ignore_index)); -} - -template -class SigmoidCrossEntropyWithLogitsNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - CheckAttrs(ctx); - - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("SigmoidCrossEntropyWithLogits", {*x, *label}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SigmoidCrossEntropyWithLogitsNPUGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - CheckAttrs(ctx); - - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner( - "SigmoidCrossEntropyWithLogitsGrad", {*x, *label, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsNPUKernel, - ops::SigmoidCrossEntropyWithLogitsNPUKernel); -REGISTER_OP_NPU_KERNEL( - sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsNPUGradKernel, - ops::SigmoidCrossEntropyWithLogitsNPUGradKernel); diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc deleted file mode 100644 index 594b0cc18e886ad3f17af98e001465d2d3d1047e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/size_op_npu.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
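
[Editor's note] The sigmoid_cross_entropy_with_logits kernels above delegate entirely to the Ascend ops (with normalize and ignore_index explicitly rejected). Assuming those ops follow the standard numerically stable definition, the per-element loss for logit x and label z is

\[
\ell(x, z) = \max(x, 0) - xz + \log\!\bigl(1 + e^{-|x|}\bigr),
\]

i.e. the stable rewriting of \(-\left[\,z\log\sigma(x) + (1-z)\log(1-\sigma(x))\,\right]\).
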
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SizeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - Tensor cpu_tensor; - auto cpu_data = - cpu_tensor.mutable_data(out->dims(), platform::CPUPlace()); - cpu_data[0] = x->numel(); - paddle::framework::TensorCopy( - cpu_tensor, - ctx.GetPlace(), - ctx.template device_context(), - out); - ctx.template device_context().Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - size, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel); diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc deleted file mode 100644 index a54ba630b274c0c68ef72db97971c684490b6c35..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/slice_op_npu.cc +++ /dev/null @@ -1,254 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/slice_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -void UpdateAttr(const framework::DDim& in_dims, - const std::vector axes, - const std::vector starts, - const std::vector ends, - std::vector* offsets, - std::vector* size) { - int cnt = 0; - for (int i = 0; i < in_dims.size(); ++i) { - int start = 0; - int end = in_dims[i]; - // NOTE(zhiqiu): Becareful that cnt may > axes.size() and result in - // overflow. - int axis = cnt < static_cast(axes.size()) ? 
axes[cnt] : -1; - if (axis == i) { - start = starts[cnt]; - if (start < 0) { - start = (start + in_dims[i]); - } - start = std::max(start, static_cast(0)); - end = ends[cnt]; - if (end < 0) { - end = (end + in_dims[i]); - } - end = std::min(end, static_cast(in_dims[i])); - cnt++; - } - - (*offsets)[i] = start; - (*size)[i] = end - start; - } -} - -template -class SliceNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - std::vector axes(axes_int.begin(), axes_int.end()); - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - - auto decrease_axis = ctx.Attr>("decrease_axis"); - auto infer_flags = ctx.Attr>("infer_flags"); - - const auto& in_dims = input->dims(); - - // Get the accurate attribute value of starts and ends - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = phi::GetVectorFromTensor( - ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = phi::GetVectorFromTensor( - ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - PADDLE_ENFORCE_EQ( - starts.size(), - axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), - axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - - if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") || - starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) { - // Infer output dims - auto out_dims = out->dims(); - auto slice_dims = out_dims; - for (size_t i = 0; i < axes.size(); ++i) { - // when start == -1 && end == start+1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = in_dims[axes[i]]; - } - } - } - - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); - slice_dims = phi::funcs::GetSliceDims( - in_dims, axes, starts, ends, nullptr, nullptr); - out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis); - - out->Resize(out_dims); - } - - out->mutable_data(ctx.GetPlace()); - - std::vector offsets(in_dims.size()); - std::vector size(in_dims.size()); - - UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); -#if CANN_VERSION_CODE < 512000 - const auto& runner = - NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, { - "size", - size - }}); -#else - NpuOpRunner runner; - runner.SetType("Slice") - .AddInput(*input) - .AddInput(std::move(offsets)) - .AddInput(std::move(size)) - .AddOutput(*out); -#endif - runner.Run(stream); - } -}; - -template -class SliceGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dinput = - 
ctx.Output(framework::GradVarName("Input")); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - std::vector axes(axes_int.begin(), axes_int.end()); - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - - // Get the accurate attribute value of starts and ends - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = phi::GetVectorFromTensor( - ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = phi::GetVectorFromTensor( - ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - const auto& in_dims = input->dims(); - int rank = in_dims.size(); - - std::vector offsets(rank); - std::vector size(rank); - UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); - - std::vector> paddings(rank, std::vector(2)); - for (int i = 0; i < rank; ++i) { - paddings[i][0] = static_cast(offsets[i]); - paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); - } - - phi::DenseTensor tmp_dout; - tmp_dout.ShareDataWith(*dout); - auto out_dims = dout->dims(); - auto decrease_axis = ctx.Attr>("decrease_axis"); - auto decrease_size = decrease_axis.size(); - if (decrease_size > 0) { - if (decrease_size == static_cast(in_dims.size())) { - out_dims = phi::make_ddim(std::vector(decrease_size, 1)); - } else { - std::vector origin_out_shape(out_dims.size() + decrease_size, -1); - for (size_t i = 0; i < decrease_size; ++i) { - origin_out_shape[decrease_axis[i]] = 1; - } - int index = 0; - for (size_t i = 0; i < origin_out_shape.size(); ++i) { - if (origin_out_shape[i] == -1) { - origin_out_shape[i] = out_dims[index]; - ++index; - } - } - out_dims = phi::make_ddim(origin_out_shape); - } - tmp_dout.Resize(out_dims); - } - - dinput->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(slice, - ops::SliceNPUKernel, - ops::SliceNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::SliceNPUKernel, -#endif - ops::SliceNPUKernel); - -REGISTER_OP_NPU_KERNEL(slice_grad, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc deleted file mode 100644 index abb6353ca0d1da624170ee0ad19c0468999342fc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
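// Annotation: two things happen across the slice kernels above. UpdateAttr
// folds (axes, starts, ends) into per-dimension offsets/sizes with
// negative-index wrapping and clamping, and the gradient is realized as
// zero-padding dout back to the input shape with
// paddings[i] = {offset_i, in_dim_i - size_i - offset_i}. A compact
// host-side sketch of both (plain STL; names are illustrative):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Fold (axes, starts, ends) into offsets/sizes, wrapping negative indices
// and clamping to the dimension bounds, as UpdateAttr does.
void UpdateAttrRef(const std::vector<int64_t>& in_dims,
                   const std::vector<int>& axes,
                   const std::vector<int64_t>& starts,
                   const std::vector<int64_t>& ends,
                   std::vector<int64_t>* offsets,
                   std::vector<int64_t>* sizes) {
  size_t cnt = 0;
  for (size_t i = 0; i < in_dims.size(); ++i) {
    int64_t start = 0, end = in_dims[i];
    if (cnt < axes.size() && axes[cnt] == static_cast<int>(i)) {
      start = starts[cnt] < 0 ? starts[cnt] + in_dims[i] : starts[cnt];
      start = std::max<int64_t>(start, 0);
      end = ends[cnt] < 0 ? ends[cnt] + in_dims[i] : ends[cnt];
      end = std::min(end, in_dims[i]);
      ++cnt;
    }
    (*offsets)[i] = start;
    (*sizes)[i] = end - start;
  }
}

// The 1-D gradient: dout padded with {offset, in_dim - size - offset} zeros.
std::vector<float> SliceGrad1DRef(const std::vector<float>& dout,
                                  int64_t in_dim, int64_t offset) {
  std::vector<float> dx(in_dim, 0.0f);
  for (size_t i = 0; i < dout.size(); ++i) dx[offset + i] = dout[i];
  return dx;
}

int main() {
  std::vector<int64_t> offsets(2), sizes(2);
  UpdateAttrRef({4, 5}, {1}, {-3}, {4}, &offsets, &sizes);
  assert(offsets[1] == 2 && sizes[1] == 2);  // dim 5: [-3, 4) -> [2, 4)
  auto dx = SliceGrad1DRef({1.f, 2.f}, 5, 2);
  assert(dx[2] == 1.f && dx[3] == 2.f && dx[4] == 0.f);
  return 0;
}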
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/smooth_l1_loss_op.h" - -namespace paddle { -namespace operators { - -template -class SmoothL1LossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* out_diff = context.Output("Diff"); - auto* out_loss = context.Output("Out"); - out_diff->mutable_data(context.GetPlace()); - out_loss->mutable_data(context.GetPlace()); - - auto sigma = context.Attr("sigma"); - T sigma2 = 1.0 / (sigma * sigma); - bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); - // out_diff = in_x - in_y - auto stream = - context.template device_context() - .stream(); - const auto& runner1 = NpuOpRunner("Sub", {*in_x, *in_y}, {*out_diff}, {}); - runner1.Run(stream); - - phi::DenseTensor no_reduce_loss(in_x->dtype()); - no_reduce_loss.Resize(in_x->dims()); - no_reduce_loss.mutable_data(context.GetPlace()); - // multiply by the inside weight before computing the loss - if (has_weight) { - phi::DenseTensor tmp_diff(out_diff->dtype()); - tmp_diff.Resize(out_diff->dims()); - tmp_diff.mutable_data(context.GetPlace()); - const auto& runner2 = - NpuOpRunner("Mul", {*out_diff, *inside_weight}, {tmp_diff}, {}); - runner2.Run(stream); - framework::TensorCopy( - tmp_diff, - context.GetPlace(), - context.template device_context(), - out_diff); - - phi::DenseTensor tmp_x(in_x->dtype()); - tmp_x.Resize(in_x->dims()); - tmp_x.mutable_data(context.GetPlace()); - - phi::DenseTensor tmp_y(in_y->dtype()); - tmp_y.Resize(in_y->dims()); - tmp_y.mutable_data(context.GetPlace()); - - // mul input and inside_weight - const auto& runner_x = - NpuOpRunner("Mul", {*in_x, *inside_weight}, {tmp_x}, {}); - runner_x.Run(stream); - const auto& runner_y = - NpuOpRunner("Mul", {*in_y, *inside_weight}, {tmp_y}, {}); - runner_y.Run(stream); - const auto& runner3 = NpuOpRunner("SmoothL1Loss", - {tmp_x, tmp_y}, - {no_reduce_loss}, - {{"sigma", sigma2}}); - runner3.Run(stream); - } else { - const auto& runner3 = NpuOpRunner("SmoothL1Loss", - {*in_x, *in_y}, - {no_reduce_loss}, - {{"sigma", sigma2}}); - runner3.Run(stream); - } - - // multiply the loss by the outside weight - // ReduceSum because the output's shape must be [B, 1] - if (has_weight) { - phi::DenseTensor tmp_loss(no_reduce_loss.dtype()); - tmp_loss.Resize(no_reduce_loss.dims()); - tmp_loss.mutable_data(context.GetPlace()); - const auto& runner4 = - NpuOpRunner("Mul", {no_reduce_loss, *outside_weight}, {tmp_loss}, {}); - runner4.Run(stream); - const auto& runner5 = - NpuOpRunner("ReduceSumD", - {tmp_loss}, - {*out_loss}, - {{"axes", std::vector{1}}, {"keep_dims", true}}); - runner5.Run(stream); - } else { - const auto& runner5 = - NpuOpRunner("ReduceSumD", - {no_reduce_loss}, - {*out_loss}, - {{"axes", std::vector{1}}, {"keep_dims", true}}); - runner5.Run(stream); - } - } -}; - -template -class SmoothL1LossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* diff = context.Input("Diff"); - auto* og = context.Input(framework::GradVarName("Out")); - auto*
outx_grad = - context.Output(framework::GradVarName("X")); - auto* outy_grad = - context.Output(framework::GradVarName("Y")); - auto sigma = context.Attr("sigma"); - T sigma2 = 1.0 / (sigma * sigma); - bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); - - auto stream = - context.template device_context() - .stream(); - - // diff == in_x - in_y == diff - 0 - phi::DenseTensor tmp_zero(diff->dtype()); - tmp_zero.Resize(diff->dims()); - tmp_zero.mutable_data(context.GetPlace()); - const auto& runner_zero = NpuOpRunner("ZerosLike", {*diff}, {tmp_zero}, {}); - runner_zero.Run(stream); - - phi::DenseTensor grad(diff->dtype()); - grad.Resize(diff->dims()); - grad.mutable_data(context.GetPlace()); - // broadcast og(output_grad) to adapt to the npu interface - const auto& runner_broad = - NpuOpRunner("BroadcastToD", - {*og}, - {grad}, - {{"shape", phi::vectorize(diff->dims())}}); - runner_broad.Run(stream); - - phi::DenseTensor gradient(diff->dtype()); - gradient.Resize(diff->dims()); - gradient.mutable_data(context.GetPlace()); - // diff == diff - 0 == in_x - in_y - const auto& runner_grad = NpuOpRunner("SmoothL1LossGrad", - {*diff, tmp_zero, grad}, - {gradient}, - {{"sigma", sigma2}}); - runner_grad.Run(stream); - - // mul weight and gradient - if (has_weight) { - phi::DenseTensor weight(inside_weight->dtype()); - weight.Resize(inside_weight->dims()); - weight.mutable_data(context.GetPlace()); - const auto& runner_weight = - NpuOpRunner("Mul", {*inside_weight, *outside_weight}, {weight}, {}); - runner_weight.Run(stream); - - phi::DenseTensor tmp_grad(gradient.dtype()); - tmp_grad.Resize(gradient.dims()); - tmp_grad.mutable_data(context.GetPlace()); - const auto& runner_weight_grad = - NpuOpRunner("Mul", {gradient, weight}, {tmp_grad}, {}); - runner_weight_grad.Run(stream); - - framework::TensorCopy( - tmp_grad, - context.GetPlace(), - context.template device_context(), - &gradient); - } - // outx_grad = gradient - if (outx_grad) { - outx_grad->mutable_data(context.GetPlace()); - framework::TensorCopy( - gradient, - context.GetPlace(), - context.template device_context(), - outx_grad); - } - - // outy_grad = - gradient - if (outy_grad) { - outy_grad->mutable_data(context.GetPlace()); - phi::DenseTensor coeff(phi::DataType::FLOAT32); - coeff.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&coeff, -1); - const auto& runner_y_grad = - NpuOpRunner("Mul", {coeff, gradient}, {*outy_grad}, {}); - runner_y_grad.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossNPUKernel); - -REGISTER_OP_NPU_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc deleted file mode 100644 index de7df0de5b3d589baf0883769ecbb140bab1af77..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
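// Annotation: the SmoothL1Loss kernels above pass sigma2 = 1 / (sigma * sigma)
// as the device op's threshold attribute, and the backward pass negates the
// shared gradient for Y (the Mul with a -1 coefficient). For reference, the
// standard smooth-L1 (Huber-style) elementwise function with threshold delta
// — my restatement of the usual definition, not code from the Ascend op:

#include <cassert>
#include <cmath>

// Quadratic near zero, linear in the tails; continuous at |diff| == delta.
double SmoothL1Ref(double diff, double delta) {
  double ad = std::fabs(diff);
  return ad < delta ? 0.5 * ad * ad / delta : ad - 0.5 * delta;
}

int main() {
  assert(std::fabs(SmoothL1Ref(1.0, 1.0) - 0.5) < 1e-12);  // boundary value
  assert(std::fabs(SmoothL1Ref(3.0, 1.0) - 2.5) < 1e-12);  // linear tail
  return 0;
}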
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto axis = ctx.Attr("axis"); - std::vector axes; - axes.push_back(axis); - framework::NPUAttributeMap attr_input = {{"axes", axes}}; - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class SoftmaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - - auto* dX = ctx.Output(framework::GradVarName("X")); - - auto dims = dX->dims(); - const int rank = dims.size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - int64_t first_dim = 1; - int64_t sec_dim = 1; - for (int i = 0; i < axis; i++) { - first_dim *= dims[i]; - } - for (int i = axis; i < rank; i++) { - sec_dim *= dims[i]; - } - - Tensor tmp_out; - tmp_out.ShareDataWith(*out).Resize({first_dim, sec_dim}); - - Tensor tmp_dOut; - tmp_dOut.ShareDataWith(*dOut).Resize({first_dim, sec_dim}); - - dX->Resize(phi::make_ddim({first_dim, sec_dim})); - dX->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {}; - const auto& runner = NpuOpRunner( - std::string("SoftmaxGrad"), {tmp_out, tmp_dOut}, {*dX}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - dX->Resize(dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - softmax, - ops::SoftmaxNPUKernel, - ops::SoftmaxNPUKernel, - ops::SoftmaxNPUKernel); - -REGISTER_OP_NPU_KERNEL( - softmax_grad, - ops::SoftmaxGradNPUKernel, - ops::SoftmaxGradNPUKernel, - ops::SoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc deleted file mode 100644 index dd1462b1c07cc52516764fa4e21cee118f2d18b2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
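// Annotation: the SoftmaxGrad path above collapses the tensors to a 2-D view
// before calling the device op: dims [0, axis) fold into first_dim and
// [axis, rank) into sec_dim, and the original dims are restored afterwards.
// A sketch of just that folding step:

#include <cassert>
#include <cstdint>
#include <vector>

// Fold dims around the canonical softmax axis, as the grad kernel does.
void FoldAroundAxis(const std::vector<int64_t>& dims, int axis,
                    int64_t* first_dim, int64_t* sec_dim) {
  *first_dim = 1;
  *sec_dim = 1;
  for (int i = 0; i < axis; ++i) *first_dim *= dims[i];
  for (int i = axis; i < static_cast<int>(dims.size()); ++i)
    *sec_dim *= dims[i];
}

int main() {
  int64_t f = 0, s = 0;
  FoldAroundAxis({2, 3, 4}, 1, &f, &s);  // [2, 3, 4] viewed as [2, 12]
  assert(f == 2 && s == 12);
  return 0;
}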
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(softmax); -USE_OP_DEVICE_KERNEL(softmax, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - for (int i = 3; i < 9; ++i) { - init.push_back(static_cast(i)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({2, 3}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({2, 3}); - tensor_out->mutable_data(place); // allocate - - // run - int axis = 1; - f::AttributeMap attrs = { - {"axis", axis}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - }; - - auto op = f::OpRegistry::CreateOp( - "softmax", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; - } - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector out_init; - - out_init.push_back(static_cast(0.6670)); - out_init.push_back(static_cast(0.5888)); - out_init.push_back(static_cast(0.4543)); - out_init.push_back(static_cast(0.3330)); - out_init.push_back(static_cast(0.4112)); - out_init.push_back(static_cast(0.5457)); - - paddle::framework::TensorFromVector(out_init, ctx, tensor_out); - tensor_out->Resize({2, 3}); - - ctx.Wait(); - - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - std::vector dout_init; - for (int i = 0; i < 6; ++i) { - dout_init.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(dout_init, ctx, tensor_dout); - tensor_dout->Resize({2, 3}); - - ctx.Wait(); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs = { - {"name", std::string("softmax_grad")}, - {"axis", static_cast(0)}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - {"data_format", std::string("AnyLayout")}, - }; - auto op = f::OpRegistry::CreateOp("softmax_grad", - {{"Out", {"Out"}}, {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_dx->dims()[0], (uint32_t)(2)); - EXPECT_EQ((uint32_t)tensor_dx->dims()[1], (uint32_t)(3)); - - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); - EXPECT_NEAR((float)out_vec[0], (float)(-0.4737), 0.1); - EXPECT_NEAR((float)out_vec[1], (float)(-0.4181), 0.1); - EXPECT_NEAR((float)out_vec[2], 
(float)(-0.3226), 0.1); - EXPECT_NEAR((float)out_vec[3], (float)(-0.0965), 0.1); - EXPECT_NEAR((float)out_vec[4], (float)(-0.1192), 0.1); - EXPECT_NEAR((float)out_vec[5], (float)(-0.1582), 0.1); -} - -TEST(softmax, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(softmax_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc deleted file mode 100644 index af0e9d55445d5ee9967c7222276f9230404307fe..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" -#include "paddle/phi/kernels/funcs/cross_entropy.h" -#include "paddle/phi/kernels/funcs/softmax.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* logits = ctx.Input("Logits"); - auto* labels = ctx.Input("Label"); - auto* softmax = ctx.Output("Softmax"); - auto* loss = ctx.Output("Loss"); - auto* backprop = ctx.Output("Backprop"); - auto soft_label = ctx.Attr("soft_label"); - PADDLE_ENFORCE_EQ(soft_label, - false, - platform::errors::Unimplemented( - "soft_label=True is not supported in " - "the npu kernel of softmax_with_cross_entropy.")); - - const int rank = logits->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logits->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); - - PADDLE_ENFORCE_EQ( - labels->numel(), - n, - platform::errors::Unimplemented( - "The size of labels should be equal to phi::funcs::SizeToAxis of " - "logits," - "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", - labels->numel(), - n)); - - loss->mutable_data(ctx.GetPlace()); - backprop->mutable_data(ctx.GetPlace()); - softmax->mutable_data(ctx.GetPlace()); - - phi::DenseTensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; - logits_2d.ShareDataWith(*logits).Resize({n, d}); - labels_1d.ShareDataWith(*labels).Resize({n}); - loss_1d.ShareDataWith(*loss).Resize({n}); - backprop_2d.ShareDataWith(*backprop).Resize({n, d}); - softmax_2d.ShareDataWith(*softmax).Resize({n, d}); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - for (auto i = axis; i < logits->dims().size(); ++i) { - axes.push_back(i); - } - const auto& runner_softmax = - NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}}); - runner_softmax.Run(stream); - - // 
SparseSoftmaxCrossEntropyWithLogits - const auto& runner_s = NpuOpRunner("SparseSoftmaxCrossEntropyWithLogits", - {logits_2d, labels_1d}, - {loss_1d, backprop_2d}, - {}); - runner_s.Run(stream); - } -}; - -template -class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* backprop = ctx.Input("Backprop"); - auto* loss_grad = - ctx.Input(framework::GradVarName("Loss")); - auto* logits_grad = - ctx.Output(framework::GradVarName("Logits")); - - PADDLE_ENFORCE_NOT_NULL(backprop, - platform::errors::PreconditionNotMet( - "backprop should not be null in NPU kernel of " - "softmax_with_cross_entropy_grad.")); - logits_grad->mutable_data(ctx.GetPlace()); - - const int rank = logits_grad->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); - - phi::DenseTensor logits_grad_2d, loss_grad_1d, backprop_2d; - - logits_grad_2d.ShareDataWith(*logits_grad).Resize({n, d}); - loss_grad_1d.ShareDataWith(*loss_grad).Resize({n}); - backprop_2d.ShareDataWith(*backprop).Resize({n, d}); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_mul = - NpuOpRunner("Mul", {*loss_grad, *backprop}, {*logits_grad}, {}); - runner_mul.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyNPUKernel, - ops::SoftmaxWithCrossEntropyNPUKernel); -REGISTER_OP_NPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradNPUKernel< - paddle::platform::NPUDeviceContext, - float>, - ops::SoftmaxWithCrossEntropyGradNPUKernel< - paddle::platform::NPUDeviceContext, - paddle::platform::float16>); diff --git a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc deleted file mode 100644 index 763b375d00e9b8c8eb54525b67f18a0d94ca8be7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/split_op_npu.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
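// Annotation: in the grad kernel above, the Backprop tensor saved by the
// forward device op already carries the per-logit gradient (conventionally
// softmax(logits) - one_hot(label) — an assumption about the Ascend op's
// output, stated here for orientation), so the logits gradient is just the
// broadcast elementwise product dLogits = dLoss * Backprop (the final Mul).
// A small reference of that contraction:

#include <cassert>
#include <vector>

// dLogits[i][j] = dLoss[i] * backprop[i][j]
std::vector<std::vector<float>> CeGradRef(
    const std::vector<float>& dloss,
    const std::vector<std::vector<float>>& backprop) {
  auto dlogits = backprop;
  for (size_t i = 0; i < dlogits.size(); ++i)
    for (float& v : dlogits[i]) v *= dloss[i];
  return dlogits;
}

int main() {
  auto g = CeGradRef({2.f}, {{0.1f, -0.9f, 0.8f}});
  assert(g[0][1] == -1.8f);
  return 0;
}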
*/ - -#include -#include - -#include "paddle/fluid/operators/split_op.h" - -namespace paddle { -namespace operators { - -template -class SplitNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - int num = ctx.Attr("num"); - std::vector sections = ctx.Attr>("sections"); - int axis = ctx.Attr("axis"); - - if (ctx.HasInput("AxisTensor")) { - // TODO(liupeng51): - PADDLE_THROW(platform::errors::Unimplemented( - "The AxisTensor is not supported on NPU now.")); - } - if (ctx.HasInput("SectionsTensorList")) { - // TODO(liupeng51): - PADDLE_THROW(platform::errors::Unimplemented( - "The SectionsTensorList is not supported on NPU now.")); - } - - std::vector outputs; - for (size_t j = 0; j < outs.size(); ++j) { - outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(*outs[j]); - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - if (sections.size() == 0) { - framework::NPUAttributeMap attr_input = {{"num_split", num}, - {"split_dim", axis}}; - runner.SetType("SplitD").AddInputs({*in}).AddOutputs(outputs).AddAttrs( - attr_input); - } else { - framework::NPUAttributeMap attr_input = { - {"size_splits", sections}, - {"split_dim", axis}, - {"num_split", static_cast(sections.size())}}; - runner.SetType("SplitVD").AddInput(*in).AddOutputs(outputs).AddAttrs( - attr_input); - } - - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(split, - ops::SplitNPUKernel, - ops::SplitNPUKernel, - ops::SplitNPUKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc deleted file mode 100644 index fb7d4607fc085f74895624744980d80a1b4c4fa2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/squared_l2_norm_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
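// Annotation: the split kernel above dispatches to one of two device ops:
// SplitD when no explicit sections are given (num equal parts) and SplitVD
// when section sizes are provided. A host-side restatement of the section
// computation, assuming (as the SplitD path requires) that the dimension is
// divisible by num:

#include <cassert>
#include <cstdint>
#include <vector>

// Sections for the "SplitD" path (num equal parts) vs. "SplitVD" (explicit).
std::vector<int64_t> SplitSections(int64_t dim, int num,
                                   const std::vector<int64_t>& sections) {
  if (sections.empty())
    return std::vector<int64_t>(num, dim / num);  // equal split
  return sections;                                // caller-specified sizes
}

int main() {
  auto equal = SplitSections(9, 3, {});
  assert(equal.size() == 3 && equal[0] == 3);
  auto uneven = SplitSections(9, 0, {2, 3, 4});
  assert(uneven[2] == 4);
  return 0;
}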
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SquaredL2NormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *out = context.Output("Out"); - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - std::vector axis; - for (int i = 0; i < x->dims().size(); ++i) { - axis.push_back(i); - } - out->mutable_data(place); - const auto &runner = NpuOpRunner( - "SquareSumV1", {*x}, {*out}, {{"axis", axis}, {"keep_dims", false}}); - runner.Run(stream); - } -}; - -template -class SquaredL2NormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *x_grad = - context.Output(framework::GradVarName("X")); - auto *out_grad = - context.Input(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - out_grad->numel(), - 1, - platform::errors::InvalidArgument( - "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar.")); - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - // broadcast out_grad - phi::DenseTensor broadcasted_out_grad; - broadcasted_out_grad.mutable_data(x_grad->dims(), place); - const auto &broadcast_runner = - NpuOpRunner("BroadcastToD", - {*out_grad}, - {broadcasted_out_grad}, - {{"shape", phi::vectorize(x_grad->dims())}}); - broadcast_runner.Run(stream); - // mul x - phi::DenseTensor tmp_x_grad; - tmp_x_grad.mutable_data(x_grad->dims(), place); - const auto &mul_x_runner = - NpuOpRunner("Mul", {broadcasted_out_grad, *x}, {tmp_x_grad}, {}); - mul_x_runner.Run(stream); - // mul coefficient:2 - phi::DenseTensor coefficient; - coefficient.mutable_data({1}, place); - FillNpuTensorWithConstant(&coefficient, static_cast(2.0)); - x_grad->mutable_data(place); - const auto &mul_coefficient_runner = - NpuOpRunner("Mul", {tmp_x_grad, coefficient}, {*x_grad}, {}); - mul_coefficient_runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - squared_l2_norm, - ops::SquaredL2NormNPUKernel); -REGISTER_OP_NPU_KERNEL( - squared_l2_norm_grad, - ops::SquaredL2NormGradNPUKernel); diff --git a/paddle/fluid/operators/squeeze_op_npu.cc b/paddle/fluid/operators/squeeze_op_npu.cc deleted file mode 100644 index 308f092ad740f1daba00ed62a2dd3012f803fe1e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/squeeze_op_npu.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
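// Annotation: the squared_l2_norm backward kernel above broadcasts the scalar
// dout over x's shape and multiplies twice, building dx = 2 * x * dout; since
// out = sum(x_i^2), the factor 2 is the analytic derivative. Reference:

#include <cassert>
#include <vector>

// out = sum(x * x); dx[i] = 2 * x[i] * dout (dout is a scalar).
float SquaredL2NormRef(const std::vector<float>& x) {
  float out = 0.f;
  for (float v : x) out += v * v;
  return out;
}
std::vector<float> SquaredL2NormGradRef(const std::vector<float>& x,
                                        float dout) {
  std::vector<float> dx(x.size());
  for (size_t i = 0; i < x.size(); ++i) dx[i] = 2.f * x[i] * dout;
  return dx;
}

int main() {
  assert(SquaredL2NormRef({1.f, 2.f}) == 5.f);
  assert(SquaredL2NormGradRef({1.f, 2.f}, 1.f)[1] == 4.f);
  return 0;
}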
*/ - -#include "paddle/fluid/operators/squeeze_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel); -REGISTER_OP_NPU_KERNEL( - squeeze2, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel); -REGISTER_OP_NPU_KERNEL( - squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel); -REGISTER_OP_NPU_KERNEL( - squeeze2_grad, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel); diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc deleted file mode 100644 index f0f683e4882465f2a71d45eff2a8f09f5a2d409b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(squeeze); -USE_OP_DEVICE_KERNEL(squeeze, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - int dim0 = 1; - int dim1 = 10; - int dim2 = 1; - - std::vector init; - for (int64_t i = 0; i < dim0 * dim1 * dim2; ++i) { - init.push_back(static_cast(0.1)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({dim0, dim1, dim2}); - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector axis; - axis.push_back(2); - f::AttributeMap attrs = {{"axes", axis}}; - - auto op = f::OpRegistry::CreateOp( - "squeeze", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(2)); - EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(dim0)); - EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(dim1)); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(0.1)); - } - - ctx.Wait(); -} - -TEST(squeeze, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc deleted file mode 100644 index 8c6447971d9ad26b99b92c783290cc601eaaa9e2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/stack_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class StackNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - int num = static_cast(x.size()); - - PADDLE_ENFORCE_GT(num, - 0, - platform::errors::InvalidArgument( - "number of input phi::DenseTensor <= 0")); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector x_list; - for (int i = 0; i < num; i++) { - x_list.push_back(*x[i]); - } - y->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); - runner.Run(stream); - } -}; - -template -class StackGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int num = dy->dims()[axis]; - - PADDLE_ENFORCE_GT(num, - 0, - platform::errors::InvalidArgument( - "number of input phi::DenseTensor <= 0")); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector dx_list; - for (int i = 0; i < num; i++) { - dx[i]->mutable_data(ctx.GetPlace()); - dx_list.push_back(*dx[i]); - } - - const auto& runner = - NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - stack, - paddle::operators::StackNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::StackNPUKernel, -#endif - paddle::operators::StackNPUKernel, - paddle::operators::StackNPUKernel); - -REGISTER_OP_NPU_KERNEL( - stack_grad, - paddle::operators::StackGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::StackGradNPUKernel, -#endif - paddle::operators::StackGradNPUKernel, - paddle::operators::StackGradNPUKernel); diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc deleted file mode 100644 index 4c3bfed5d5d4bc1c72fbe69bc9ea7c76e63e4a25..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
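// Annotation: the stack kernels above normalize a negative axis differently
// in the two directions before dispatching to Pack/Unpack: the forward output
// gains a dimension, so a negative axis is shifted by rank + 1, while the
// backward pass shifts by dy's own rank. A sketch of just that subtlety:

#include <cassert>

// Forward stacks N rank-R tensors into rank R+1.
int NormalizeStackAxis(int axis, int input_rank) {
  return axis < 0 ? axis + input_rank + 1 : axis;
}
// Backward unpacks along an axis of the stacked tensor's rank.
int NormalizeUnstackAxis(int axis, int stacked_rank) {
  return axis < 0 ? axis + stacked_rank : axis;
}

int main() {
  assert(NormalizeStackAxis(-1, 2) == 2);    // stack rank-2 inputs at the end
  assert(NormalizeUnstackAxis(-1, 3) == 2);  // unpack the matching dimension
  return 0;
}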
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/strided_slice.h" - -namespace paddle { -namespace operators { - -using Variable = framework::Variable; -using LoDTensorArray = framework::LoDTensorArray; -using DDim = framework::DDim; - -template -class StridedSliceNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - PADDLE_ENFORCE_EQ(is_tensor_array, - false, - platform::errors::InvalidArgument( - "phi::DenseTensor array as input is not supported.")); - int rank = ctx.Input("Input")->dims().size(); - switch (rank) { - case 1: - StridedSliceCompute<1>(ctx); - break; - case 2: - StridedSliceCompute<2>(ctx); - break; - case 3: - StridedSliceCompute<3>(ctx); - break; - case 4: - StridedSliceCompute<4>(ctx); - break; - case 5: - StridedSliceCompute<5>(ctx); - break; - case 6: - StridedSliceCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input is supported up to 6.")); - break; - } - } - - private: - template - void StridedSliceCompute(const framework::ExecutionContext& ctx) const { - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - auto in = ctx.Input("Input"); - auto out = ctx.Output("Out"); - auto in_dims = in->dims(); - - // list - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - auto strides_int = ctx.Attr>("strides"); - - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - std::vector strides(strides_int.begin(), strides_int.end()); - - auto axes = ctx.Attr>("axes"); - auto infer_flags = ctx.Attr>("infer_flags"); - auto decrease_axis = ctx.Attr>("decrease_axis"); - - // vector> - auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); - - // phi::DenseTensor - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = phi::GetVectorFromTensor(starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); - ends = phi::GetVectorFromTensor(ends_tensor); - } - - if (list_new_strides_tensor.size() > 0) { - strides = GetDataFromTensorList(list_new_strides_tensor); - } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); - strides = phi::GetVectorFromTensor(strides_tensor); - } - - // out dims calculation - std::vector out_dims_vector(in_dims.size(), -1); - phi::funcs::StridedSliceOutDims(starts, - ends, - strides, - axes, - infer_flags, - in_dims, - decrease_axis, - out_dims_vector.data(), - axes.size(), - false); - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - // check whether need to reverse (false: stride > 0; true: stride < 0) - std::vector reverse_vector(starts.size(), 0); - phi::funcs::StridedSliceFunctor(starts.data(), - ends.data(), - strides.data(), - axes.data(), - reverse_vector.data(), - in_dims, - infer_flags, - 
decrease_axis, - starts.size()); - - // construct the starts_indices, ends_indices and strides_indices tensor for - // calling StridedSlice op - std::vector starts_indices_vector(D, 0); - std::vector ends_indices_vector(out_dims_vector.begin(), - out_dims_vector.end()); - std::vector strides_indices_vector(D, 1); - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices_vector[axis_index] = starts[axis]; - ends_indices_vector[axis_index] = ends[axis]; - strides_indices_vector[axis_index] = strides[axis]; - } - - phi::DenseTensor starts_indices_tensor; - phi::DenseTensor ends_indices_tensor; - phi::DenseTensor strides_indices_tensor; - - starts_indices_tensor.mutable_data({D}, place); - ends_indices_tensor.mutable_data({D}, place); - strides_indices_tensor.mutable_data({D}, place); - - paddle::framework::TensorFromVector( - starts_indices_vector, ctx.device_context(), &starts_indices_tensor); - paddle::framework::TensorFromVector( - ends_indices_vector, ctx.device_context(), &ends_indices_tensor); - paddle::framework::TensorFromVector( - strides_indices_vector, ctx.device_context(), &strides_indices_tensor); - - auto out_dims_origin = out_dims; - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], - 1, - platform::errors::InvalidArgument( - "the size of decrease dimension should be 1, but received %d.", - out_dims[decrease_axis[i]])); - out_dims_origin[decrease_axis[i]] = 0; - } - - for (int i = 0; i < out_dims_origin.size(); ++i) { - if (out_dims_origin[i] != 0) { - new_out_shape.push_back(out_dims_origin[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - out_dims_origin = phi::make_ddim(new_out_shape); - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - out->Resize(out_dims); - out->mutable_data(place); - - const auto& runner = NpuOpRunner("StridedSlice", - {*in, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor}, - {*out}, - {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}); - runner.Run(stream); - - if (need_reverse) { - phi::DenseTensor out_tmp; - out_tmp.mutable_data(out_dims, place); - paddle::framework::TensorCopy( - *out, - place, - ctx.template device_context(), - &out_tmp); - - phi::DenseTensor reverse_axis; - std::vector reverse_axis_vector; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - reverse_axis_vector.push_back(axes[axis]); - } - } - reverse_axis.mutable_data( - {static_cast(reverse_axis_vector.size())}, place); - paddle::framework::TensorFromVector( - reverse_axis_vector, ctx.device_context(), &reverse_axis); - - const auto& runner_reverse = - NpuOpRunner("ReverseV2", {out_tmp, reverse_axis}, {*out}); - runner_reverse.Run(stream); - } - - if (decrease_axis.size() > 0) { - out->Resize(out_dims_origin); - } - } -}; - -template -class StridedSliceGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - PADDLE_ENFORCE_EQ(is_tensor_array, - false, - platform::errors::InvalidArgument( - "phi::DenseTensor array as input is not supported.")); - int rank = 
ctx.Input("Input")->dims().size(); - - switch (rank) { - case 1: - StridedSliceGradCompute<1>(ctx); - break; - case 2: - StridedSliceGradCompute<2>(ctx); - break; - case 3: - StridedSliceGradCompute<3>(ctx); - break; - case 4: - StridedSliceGradCompute<4>(ctx); - break; - case 5: - StridedSliceGradCompute<5>(ctx); - break; - case 6: - StridedSliceGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input is supported up to 6.")); - break; - } - } - - private: - template - void StridedSliceGradCompute(const framework::ExecutionContext& ctx) const { - auto place = ctx.GetPlace(); - auto& dev_ctx = - ctx.template device_context(); - - auto* input = ctx.Input("Input"); - auto input_dims = input->dims(); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); - dx->mutable_data(input_dims, place); - - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - auto strides_int = ctx.Attr>("strides"); - - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - std::vector strides(strides_int.begin(), strides_int.end()); - - auto axes = ctx.Attr>("axes"); - auto infer_flags = ctx.Attr>("infer_flags"); - auto decrease_axis = ctx.Attr>("decrease_axis"); - - auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = phi::GetVectorFromTensor(starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); - ends = phi::GetVectorFromTensor(ends_tensor); - } - - if (list_new_strides_tensor.size() > 0) { - strides = GetDataFromTensorList(list_new_strides_tensor); - } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); - strides = phi::GetVectorFromTensor(strides_tensor); - } - - std::vector out_dims_vector(input_dims.size(), -1); - phi::funcs::StridedSliceOutDims(starts, - ends, - strides, - axes, - infer_flags, - input_dims, - decrease_axis, - out_dims_vector.data(), - axes.size(), - false); - - std::vector reverse_vector(starts.size(), 0); - phi::funcs::StridedSliceFunctor(starts.data(), - ends.data(), - strides.data(), - axes.data(), - reverse_vector.data(), - input_dims, - infer_flags, - decrease_axis, - starts.size()); - - std::vector starts_indices_vector(D, 0); - std::vector ends_indices_vector(out_dims_vector.begin(), - out_dims_vector.end()); - std::vector strides_indices_vector(D, 1); - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices_vector[axis_index] = starts[axis]; - ends_indices_vector[axis_index] = ends[axis]; - strides_indices_vector[axis_index] = strides[axis]; - } - - phi::DenseTensor starts_indices_tensor; - phi::DenseTensor ends_indices_tensor; - phi::DenseTensor strides_indices_tensor; - - starts_indices_tensor.mutable_data({D}, place); - ends_indices_tensor.mutable_data({D}, place); - strides_indices_tensor.mutable_data({D}, place); - - paddle::framework::TensorFromVector( - starts_indices_vector, dev_ctx, 
&starts_indices_tensor); - paddle::framework::TensorFromVector( - ends_indices_vector, dev_ctx, &ends_indices_tensor); - paddle::framework::TensorFromVector( - strides_indices_vector, dev_ctx, &strides_indices_tensor); - - std::vector input_dims_vector; - for (int i = 0; i < input_dims.size(); i++) { - input_dims_vector.push_back(input_dims[i]); - } - phi::DenseTensor input_dims_tensor; - paddle::framework::TensorFromVector( - input_dims_vector, dev_ctx, &input_dims_tensor); - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto stream = dev_ctx.stream(); - framework::NPUAttributeMap attr_input = {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}; - - if (need_reverse) { - phi::DenseTensor reverse_axis; - std::vector reverse_axis_vector; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - reverse_axis_vector.push_back(axes[axis]); - } - } - reverse_axis.mutable_data( - {static_cast(reverse_axis_vector.size())}, place); - paddle::framework::TensorFromVector( - reverse_axis_vector, dev_ctx, &reverse_axis); - - phi::DenseTensor dout_tmp; - dout_tmp.mutable_data(dout->dims(), place); - const auto& runner_reverse = - NpuOpRunner("ReverseV2", {*dout, reverse_axis}, {dout_tmp}); - runner_reverse.Run(stream); - - const auto& runner = NpuOpRunner("StridedSliceGrad", - {input_dims_tensor, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor, - dout_tmp}, - {*dx}, - attr_input); - runner.Run(stream); - } else { - const auto& runner = NpuOpRunner("StridedSliceGrad", - {input_dims_tensor, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor, - *dout}, - {*dx}, - attr_input); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - strided_slice, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel); - -REGISTER_OP_NPU_KERNEL( - strided_slice_grad, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc deleted file mode 100644 index 5d1656b79e9a8536fd81e00ead0a6b9a0e76c25d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sum_op_npu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include - -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using SelectedRows = phi::SelectedRows; - -template -class SumNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto out_var = ctx.OutputVar("Out"); - if (out_var->IsType()) { - auto *out = out_var->GetMutable(); - auto x = ctx.MultiInput("X"); - out->mutable_data(ctx.GetPlace()); - - auto place = ctx.GetPlace(); - - int n = static_cast(x.size()); - if (n == 1) { - paddle::framework::TensorCopy(*x[0], place, out); - return; - } - - std::vector inputs; - std::vector names; - for (int i = 0; i < n; ++i) { - if (x[i] && x[i]->numel() > 0) { - inputs.push_back(*x[i]); - names.push_back("x" + std::to_string(i)); - } else { - continue; - } - } - - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}}; - runner.AddInputNames(names); - runner.Run(stream); - } else if (out_var->IsType()) { - auto in_vars = ctx.MultiInputVar("X"); - bool in_place = out_var == in_vars[0]; - auto &out_array = *out_var->GetMutable(); - for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) { - PADDLE_ENFORCE_EQ(in_vars[i]->IsType(), - true, - platform::errors::InvalidArgument( - "All inputs are expected to be TensorArray, " - "but inputs[%d] is not a TensorArray.", - i)); - auto &in_array = in_vars[i]->Get(); - - for (size_t i = 0; i < in_array.size(); ++i) { - if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) { - if (i >= out_array.size()) { - out_array.resize(i + 1); - } - if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) { - framework::TensorCopy(in_array[i], - in_array[i].place(), - ctx.device_context(), - &out_array[i]); - out_array[i].set_lod(in_array[i].lod()); - } else { - PADDLE_ENFORCE_EQ( - out_array[i].lod(), - in_array[i].lod(), - platform::errors::InvalidArgument( - "The lod of inputs[%d] and" - " outputs[%d] must be the same, but they differ.", - i, - i)); - auto stream = ctx.template device_context< - paddle::platform::NPUDeviceContext>() - .stream(); - NpuOpRunner runner{ - "Add", {out_array[i], in_array[i]}, {out_array[i]}, {}}; - runner.Run(stream); - } - } - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) must be phi::DenseTensor or " - "LoDTensorArray. But got " - "unsupported type: %s.", - framework::ToTypeName(out_var->Type()))); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - sum, - ops::SumNPUKernel, - ops::SumNPUKernel); diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc deleted file mode 100644 index 1b3ed3ccc7a737afed80da0cf17850ff1ed823ac..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ /dev/null @@ -1,1105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
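Stepping back to the SumNPUKernel just removed: the dense branch short-circuits a single input to a TensorCopy, otherwise it drops null/empty inputs and issues one AddN with generated input names ("x0", "x1", ...). A sketch of that filtering step, with `FakeTensor` as a stand-in type rather than a Paddle API:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for the dense-tensor branch above: skip null/empty inputs and
// build the dynamic input names handed to the AddN runner.
struct FakeTensor { long numel; };

int main() {
  FakeTensor a{4}, b{0}, c{4};
  std::vector<const FakeTensor*> x = {&a, nullptr, &b, &c};

  std::vector<const FakeTensor*> inputs;
  std::vector<std::string> names;
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] && x[i]->numel > 0) {
      inputs.push_back(x[i]);
      names.push_back("x" + std::to_string(i));
    }
  }
  std::printf("kept %zu of %zu inputs\n", inputs.size(), x.size());
  for (const auto& n : names) std::printf("%s\n", n.c_str());
}
```

One thing readable straight from the removed code: the attribute "N" is set from the unfiltered count `n`, while only the kept tensors are bound as inputs, which is worth keeping in mind if this kernel is ever resurrected.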
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -void training_or_inference(const framework::ExecutionContext &ctx, - const aclrtStream &stream, - const platform::Place &place, - const DataLayout &layout, - const bool &test_mode, - const int &N, - const int &C, - const int &H, - const int &W, - const float epsilon, - const float &momentum, - const phi::DenseTensor *common_mean, - const phi::DenseTensor *common_var, - const phi::DenseTensor *x, - const phi::DenseTensor *scale, - const phi::DenseTensor *bias, - const phi::DenseTensor *mean, - const phi::DenseTensor *variance, - phi::DenseTensor *mean_out, - phi::DenseTensor *variance_out, - phi::DenseTensor *saved_mean, - phi::DenseTensor *saved_variance, - phi::DenseTensor *y) { - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - std::vector multiples; - if (layout == phi::DataLayout::kNCHW) - multiples = {N, 1, H, W}; - else if (layout == phi::DataLayout::kNHWC) - multiples = {N, H, W, 1}; - - phi::DenseTensor common_mean_tile_1; - { - common_mean_tile_1.Resize({C}); - common_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*common_mean, place, &common_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - common_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - common_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor common_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - common_mean_tile.Resize(x->dims()); - common_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {common_mean_tile_1}, {common_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_1; - { - common_var_tile_1.Resize({C}); - common_var_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*common_var, place, &common_var_tile_1); - if (layout == phi::DataLayout::kNCHW) - common_var_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - common_var_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor common_var_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - common_var_tile.Resize(x->dims()); - common_var_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {common_var_tile_1}, {common_var_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - common_var_tile_add_epsilon.Resize(x->dims()); - common_var_tile_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {common_var_tile}, {common_var_tile_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_add_epsilon_sqrt; - { - common_var_tile_add_epsilon_sqrt.Resize(x->dims()); - common_var_tile_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner("Sqrt", - {common_var_tile_add_epsilon},
- {common_var_tile_add_epsilon_sqrt}, - {}); - runner.Run(stream); - } - - phi::DenseTensor x_sub_common_mean; - { - x_sub_common_mean.Resize(x->dims()); - x_sub_common_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*x, common_mean_tile}, {x_sub_common_mean}, {}); - runner.Run(stream); - } - - phi::DenseTensor normalized; - { - normalized.Resize(x->dims()); - normalized.mutable_data(place); - const auto &runner = - NpuOpRunner("Div", - {x_sub_common_mean, common_var_tile_add_epsilon_sqrt}, - {normalized}, - {}); - runner.Run(stream); - } - - phi::DenseTensor scale_tile_1; - { - scale_tile_1.Resize({C}); - scale_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); - if (layout == phi::DataLayout::kNCHW) - scale_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - scale_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor scale_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - scale_tile.Resize(x->dims()); - scale_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor normalized_mul_scale; - { - normalized_mul_scale.Resize(x->dims()); - normalized_mul_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {normalized, scale_tile}, {normalized_mul_scale}, {}); - runner.Run(stream); - } - - phi::DenseTensor bias_tile_1; - { - bias_tile_1.Resize({C}); - bias_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*bias, place, &bias_tile_1); - if (layout == phi::DataLayout::kNCHW) - bias_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - bias_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor bias_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - bias_tile.Resize(x->dims()); - bias_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {bias_tile_1}, {bias_tile}, attr_input); - runner.Run(stream); - } - - // calculate y - { - y->mutable_data(place); - const auto &runner = - NpuOpRunner("Add", {normalized_mul_scale, bias_tile}, {*y}, {}); - runner.Run(stream); - } - - if (!test_mode) { - phi::DenseTensor ones; - { - ones.Resize({C}); - ones.mutable_data(place); - FillNpuTensorWithConstant(&ones, 1); - } - - // cacl mean_out - { - phi::DenseTensor common_mean_mul_1_sub_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; - common_mean_mul_1_sub_momentum.Resize({C}); - common_mean_mul_1_sub_momentum.mutable_data(place); - const auto &runner = NpuOpRunner("Muls", - {*common_mean}, - {common_mean_mul_1_sub_momentum}, - attr_input); - runner.Run(stream); - } - - phi::DenseTensor mean_mul_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", momentum}}; - mean_mul_momentum.Resize({C}); - mean_mul_momentum.mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {*mean}, {mean_mul_momentum}, attr_input); - runner.Run(stream); - } - - mean_out->mutable_data(place); - - const auto &runner = - NpuOpRunner("Add", - {common_mean_mul_1_sub_momentum, mean_mul_momentum}, - {*mean_out}, - {}); - runner.Run(stream); - } - - // cacl variance_out - { - phi::DenseTensor momentum_mul_var; - { - framework::NPUAttributeMap attr_input = {{"value", momentum}}; - momentum_mul_var.Resize({C}); - momentum_mul_var.mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {*variance}, {momentum_mul_var}, attr_input); - runner.Run(stream); - 
} - - phi::DenseTensor var_ref_mul_1_sub_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; - var_ref_mul_1_sub_momentum.Resize({C}); - var_ref_mul_1_sub_momentum.mutable_data(place); - const auto &runner = NpuOpRunner( - "Muls", {*common_var}, {var_ref_mul_1_sub_momentum}, attr_input); - runner.Run(stream); - } - - variance_out->mutable_data(place); - - const auto &runner = - NpuOpRunner("Add", - {var_ref_mul_1_sub_momentum, momentum_mul_var}, - {*variance_out}, - {}); - runner.Run(stream); - } - - // cacl saved_variance - { - phi::DenseTensor var_ref_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - var_ref_add_epsilon.Resize({C}); - var_ref_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {*common_var}, {var_ref_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_add_epsilon_sqrt; - { - var_ref_add_epsilon_sqrt.Resize({C}); - var_ref_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner( - "Sqrt", {var_ref_add_epsilon}, {var_ref_add_epsilon_sqrt}, {}); - runner.Run(stream); - } - - saved_variance->mutable_data(place); - - const auto &runner = NpuOpRunner( - "Div", {ones, var_ref_add_epsilon_sqrt}, {*saved_variance}, {}); - runner.Run(stream); - } - } -} - -template -class SyncBatchNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = phi::StringToDataLayout(layout_str); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - - PADDLE_ENFORCE_EQ(use_global_stats, - false, - platform::errors::InvalidArgument( - "sync_batch_norm doesn't support " - "to set use_global_stats True. Please use batch_norm " - "in this case.")); - - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *mean = ctx.Input("Mean"); - const auto *variance = ctx.Input("Variance"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), - 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. 
But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, - x_dims.size())); - - int N, C, H, W, D; - phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - - int x_numel = x->numel(); - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - bool test_mode = is_test && (!trainable_stats); - if (test_mode) { // inference - // cacl saved_mean - saved_mean->mutable_data(place); - paddle::framework::TensorCopySync(*mean, place, saved_mean); - - // cacl saved_variance - saved_variance->mutable_data(place); - paddle::framework::TensorCopySync(*variance, place, saved_variance); - - // cacl y - training_or_inference(ctx, - stream, - place, - layout, - test_mode, - N, - C, - H, - W, - epsilon, - momentum, - mean, - variance, - x, - scale, - bias, - mean, - variance, - NULL, - NULL, - NULL, - NULL, - y); - - } else { // training - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - phi::DenseTensor mom_cpu; - paddle::framework::TensorCopySync( - *mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // cacl saved_mean and var_ref - phi::DenseTensor var_ref; - var_ref.Resize({C}); - var_ref.mutable_data(place); - { - phi::DenseTensor x_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_sum.Resize({C}); - x_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*x}, {x_sum}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_square; - { - x_square.Resize(x->dims()); - x_square.mutable_data(place); - const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_square_sum.Resize({C}); - x_square_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); - runner.Run(stream); - } - - auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); - - float device_counts = 0.0; - if (comm) { - HcclDataType dtype = platform::ToHCCLDataType( - framework::TransToProtoVarType(mean_out->dtype())); - - phi::DenseTensor device_count_tensor; - { - device_count_tensor.Resize({1}); - device_count_tensor.mutable_data(place); - FillNpuTensorWithConstant(&device_count_tensor, 1); - } - - // HcclAllReduce device_count_tensor - { - void *sendbuff = reinterpret_cast( - const_cast(device_count_tensor.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - 1, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - std::vector device_count_vec(1); - paddle::framework::TensorToVector( - device_count_tensor, ctx.device_context(), &device_count_vec); - device_counts = device_count_vec[0]; - - // HcclAllReduce x_sum - { - void *sendbuff = reinterpret_cast( - const_cast(x_sum.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - // HcclAllReduce x_square_sum - { - void *sendbuff = reinterpret_cast( - const_cast(x_square_sum.data())); - void *recvbuff = sendbuff; - 
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - } - - // cacl saved_mean - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel / device_counts}}; - saved_mean->mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {x_sum}, {*saved_mean}, attr_input); - runner.Run(stream); - } - - // cacl var_ref - { - phi::DenseTensor saved_mean_square; - { - saved_mean_square.Resize({C}); - saved_mean_square.mutable_data(place); - const auto &runner = - NpuOpRunner("Square", {*saved_mean}, {saved_mean_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tmp; - var_ref_tmp.Resize({C}); - var_ref_tmp.mutable_data(place); - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel / device_counts}}; - const auto &runner = - NpuOpRunner("Muls", {x_square_sum}, {var_ref_tmp}, attr_input); - runner.Run(stream); - } - - // cacl var_ref - { - const auto &runner = NpuOpRunner( - "Sub", {var_ref_tmp, saved_mean_square}, {var_ref}, {}); - runner.Run(stream); - } - } - } - - training_or_inference(ctx, - stream, - place, - layout, - test_mode, - N, - C, - H, - W, - epsilon, - momentum, - saved_mean, - &var_ref, - x, - scale, - bias, - mean, - variance, - mean_out, - variance_out, - saved_mean, - saved_variance, - y); - } - } -}; - -template -class SyncBatchNormNPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - float epsilon = ctx.Attr("epsilon"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = phi::StringToDataLayout(layout_str); - - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - const auto *saved_mean = ctx.Input("SavedMean"); - - const phi::DenseTensor *x; - if (ctx.HasInput("Y")) { - PADDLE_ENFORCE_EQ(true, - false, - platform::errors::InvalidArgument( - "sync_batch_norm_grad doesn't support input Y")); - } else { - x = ctx.Input("X"); - } - - int N, C, H, W, D; - phi::funcs::ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); - - int x_numel = x->numel(); - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - std::vector multiples; - if (layout == phi::DataLayout::kNCHW) - multiples = {N, 1, H, W}; - else if (layout == phi::DataLayout::kNHWC) - multiples = {N, H, W, 1}; - - auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); - HcclDataType dtype = platform::ToHCCLDataType( - framework::TransToProtoVarType(scale->dtype())); - - float device_counts = 0.0; - if (comm) { - phi::DenseTensor device_count_tensor; - { - device_count_tensor.Resize({1}); - device_count_tensor.mutable_data(place); - FillNpuTensorWithConstant(&device_count_tensor, 1); - } - - // HcclAllReduce device_count_tensor - { - void *sendbuff = reinterpret_cast( - const_cast(device_count_tensor.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclAllReduce(sendbuff, - recvbuff, - 1, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - 
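The training path above reduces x and x squared over the non-channel axes on each device, all-reduces the partial sums with HcclAllReduce (together with a ones tensor that counts participating devices), and rescales by `1.0f * C / x_numel / device_counts`, which is exactly 1/(N*H*W*num_devices) per channel; the variance then follows from E[x^2] - E[x]^2. A scalar sketch of that recovery, with illustrative numbers only:

```cpp
#include <cstdio>
#include <vector>

// Scalar sketch of the cross-device statistics above: given per-device sums
// of x and x^2 for one channel, recover the global mean and (biased)
// variance via E[x^2] - E[x]^2. The real kernel does this on NPU tensors
// after HcclAllReduce; the data here is made up.
int main() {
  std::vector<double> sum_x = {12.0, 14.0};   // per-device sums, 2 devices
  std::vector<double> sum_x2 = {40.0, 52.0};  // per-device sums of squares
  const double elems_per_device = 6.0;        // N*H*W on each device

  double total_x = 0.0, total_x2 = 0.0;
  for (size_t d = 0; d < sum_x.size(); ++d) {
    total_x += sum_x[d];
    total_x2 += sum_x2[d];
  }
  const double n = elems_per_device * sum_x.size();
  const double mean = total_x / n;  // the "Muls" by C/x_numel/device_counts
  const double var = total_x2 / n - mean * mean;
  std::printf("mean=%f var=%f\n", mean, var);
}
```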
- std::vector device_count_vec(1); - paddle::framework::TensorToVector( - device_count_tensor, ctx.device_context(), &device_count_vec); - device_counts = device_count_vec[0]; - PADDLE_ENFORCE_GE( - device_counts, - 2, - platform::errors::PreconditionNotMet("device_counts should >= 2.")); - } - - // cacl var_ref - phi::DenseTensor var_ref; - var_ref.Resize({C}); - var_ref.mutable_data(place); - { - // cacl var_ref - { - phi::DenseTensor x_square; - { - x_square.Resize(x->dims()); - x_square.mutable_data(place); - const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_square_sum.Resize({C}); - x_square_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum_mean; - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel}}; - x_square_sum_mean.Resize({C}); - x_square_sum_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Muls", {x_square_sum}, {x_square_sum_mean}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor mean_square; - { - mean_square.Resize({C}); - mean_square.mutable_data(place); - const auto &runner = - NpuOpRunner("Square", {*saved_mean}, {mean_square}, {}); - runner.Run(stream); - } - - // cacl var_ref - { - const auto &runner = NpuOpRunner( - "Sub", {x_square_sum_mean, mean_square}, {var_ref}, {}); - runner.Run(stream); - } - } - } - - phi::DenseTensor saved_mean_tile_1; - { - saved_mean_tile_1.Resize({C}); - saved_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*saved_mean, place, &saved_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - saved_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - saved_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor saved_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - saved_mean_tile.Resize(x->dims()); - saved_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {saved_mean_tile_1}, {saved_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_sub_saved_mean; - { - x_sub_saved_mean.Resize(x->dims()); - x_sub_saved_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*x, saved_mean_tile}, {x_sub_saved_mean}, {}); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_1; - { - var_ref_tile_1.Resize({C}); - var_ref_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(var_ref, place, &var_ref_tile_1); - if (layout == phi::DataLayout::kNCHW) - var_ref_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - var_ref_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor var_ref_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - var_ref_tile.Resize(x->dims()); - var_ref_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {var_ref_tile_1}, {var_ref_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - var_ref_tile_add_epsilon.Resize(x->dims()); - var_ref_tile_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {var_ref_tile}, {var_ref_tile_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_add_epsilon_sqrt; - { 
- var_ref_tile_add_epsilon_sqrt.Resize(x->dims()); - var_ref_tile_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner("Sqrt", - {var_ref_tile_add_epsilon}, - {var_ref_tile_add_epsilon_sqrt}, - {}); - runner.Run(stream); - } - - phi::DenseTensor dy_mul_x_sub_mean_for_scale; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_mul_x_sub_mean_for_scale.Resize(x->dims()); - dy_mul_x_sub_mean_for_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean_for_scale}, {}); - runner.Run(stream); - } else { - dy_mul_x_sub_mean_for_scale.Resize(x->dims()); - dy_mul_x_sub_mean_for_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean_for_scale}, {}); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mul_x_sub_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_mul_x_sub_mean.Resize(x->dims()); - dy_mul_x_sub_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean}, {}); - runner.Run(stream); - } else { - dy_mul_x_sub_mean.Resize(x->dims()); - dy_mul_x_sub_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean}, {}); - runner.Run(stream); - } - } - - // HcclAllReduce dy_mul_x_sub_mean - if (comm) { - { - void *sendbuff = reinterpret_cast( - const_cast(dy_mul_x_sub_mean.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclAllReduce(sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f / device_counts}}; - const auto &runner = NpuOpRunner( - "Muls", {dy_mul_x_sub_mean}, {dy_mul_x_sub_mean}, attr_input); - runner.Run(stream); - } - } - - // cacl d_x - if (d_x) { - phi::DenseTensor dy_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mean.Resize({C}); - dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); - runner.Run(stream); - } else { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mean.Resize({C}); - dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); - runner.Run(stream); - } - } - - // HcclAllReduce dy_mean - if (comm) { - { - void *sendbuff = reinterpret_cast( - const_cast(dy_mean.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f / device_counts}}; - const auto &runner = - NpuOpRunner("Muls", {dy_mean}, {dy_mean}, attr_input); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mean_tile_1; - { - dy_mean_tile_1.Resize({C}); - dy_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(dy_mean, place, &dy_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - dy_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - dy_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor dy_mean_tile; - { - framework::NPUAttributeMap 
attr_input = {{"multiples", multiples}}; - dy_mean_tile.Resize(x->dims()); - dy_mean_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {dy_mean_tile_1}, {dy_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor dy_sub_dy_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_sub_dy_mean.Resize(x->dims()); - dy_sub_dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); - runner.Run(stream); - } else { - dy_sub_dy_mean.Resize(x->dims()); - dy_sub_dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mul_x_sub_mean_mean; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mul_x_sub_mean_mean.Resize({C}); - dy_mul_x_sub_mean_mean.mutable_data(place); - const auto &runner = NpuOpRunner("ReduceMeanD", - {dy_mul_x_sub_mean}, - {dy_mul_x_sub_mean_mean}, - attr_input); - runner.Run(stream); - } - - phi::DenseTensor dy_mul_x_sub_mean_mean_tile_1; - { - dy_mul_x_sub_mean_mean_tile_1.Resize({C}); - dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync( - dy_mul_x_sub_mean_mean, place, &dy_mul_x_sub_mean_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - dy_mul_x_sub_mean_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - dy_mul_x_sub_mean_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor dy_mul_x_sub_mean_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - dy_mul_x_sub_mean_mean_tile.Resize(x->dims()); - dy_mul_x_sub_mean_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner("TileD", - {dy_mul_x_sub_mean_mean_tile_1}, - {dy_mul_x_sub_mean_mean_tile}, - attr_input); - runner.Run(stream); - } - - // (x - mean) * np.mean(dy * (x - mean), axis=axis) - // x_sub_saved_mean * dy_mul_x_sub_mean_mean_tile - phi::DenseTensor tmp1; - { - tmp1.Resize(x->dims()); - tmp1.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {x_sub_saved_mean, dy_mul_x_sub_mean_mean_tile}, {tmp1}, {}); - runner.Run(stream); - } - - // (x - mean) * np.mean(dy * (x - mean), axis=axis) / (var + epsilon) - // tmp1 / (var + epsilon) - // tmp1 / var_ref_tile_add_epsilon - phi::DenseTensor tmp2; - { - tmp2.Resize(x->dims()); - tmp2.mutable_data(place); - const auto &runner = - NpuOpRunner("Div", {tmp1, var_ref_tile_add_epsilon}, {tmp2}, {}); - runner.Run(stream); - } - - // dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), axis) / - // (var + epsilon) - // dy_sub_dy_mean - tmp2 - phi::DenseTensor tmp3; - { - tmp3.Resize(x->dims()); - tmp3.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {dy_sub_dy_mean, tmp2}, {tmp3}, {}); - runner.Run(stream); - } - - phi::DenseTensor scale_tile_1; - { - scale_tile_1.Resize({C}); - scale_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); - if (layout == phi::DataLayout::kNCHW) - scale_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - scale_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor scale_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - scale_tile.Resize(x->dims()); - scale_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); - runner.Run(stream); - } - - // scale 
* (dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), - // axis) / (var + epsilon)) - // scale * tmp3 - phi::DenseTensor dx_1; - { - dx_1.Resize(x->dims()); - dx_1.mutable_data(place); - - const auto &runner = NpuOpRunner("Mul", {scale_tile, tmp3}, {dx_1}, {}); - runner.Run(stream); - } - - // dx_1 / var_ref_tile_add_epsilon_sqrt - { - d_x->Resize(x->dims()); - d_x->mutable_data(place); - const auto &runner = NpuOpRunner( - "Div", {dx_1, var_ref_tile_add_epsilon_sqrt}, {*d_x}, {}); - runner.Run(stream); - } - } - - // cacl d_scale - if (d_scale) { - phi::DenseTensor d_scale_2; - { - d_scale_2.Resize(x->dims()); - d_scale_2.mutable_data(place); - const auto &runner = NpuOpRunner( - "Div", - {dy_mul_x_sub_mean_for_scale, var_ref_tile_add_epsilon_sqrt}, - {d_scale_2}, - {}); - runner.Run(stream); - } - - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_scale->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {d_scale_2}, {*d_scale}, attr_input); - runner.Run(stream); - } - } - - // cacl d_bias - if (d_bias) { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_bias->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); - runner.Run(stream); - } else { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_bias->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); - runner.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - sync_batch_norm, - ops::SyncBatchNormNPUKernel); -REGISTER_OP_NPU_KERNEL( - sync_batch_norm_grad, - ops::SyncBatchNormNPUGradKernel); diff --git a/paddle/fluid/operators/take_along_axis_op_npu.cc b/paddle/fluid/operators/take_along_axis_op_npu.cc deleted file mode 100644 index ce10caf1b2e19b6d855ad5994117212cfe81e00a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
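Before moving on to the next file: the d_x computation assembled step by step above (tmp1 through tmp3, then dx_1) is the standard batch-norm input gradient already spelled out in the code comments. A scalar reference sketch of the same formula, one channel, illustrative data only:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Scalar reference for the d_x formula above:
//   dx = scale * (dy - mean(dy) - (x - mu) * mean(dy*(x - mu)) / (var + eps))
//        / sqrt(var + eps)
int main() {
  std::vector<double> x = {1.0, 2.0, 3.0, 4.0};
  std::vector<double> dy = {0.1, -0.2, 0.3, 0.0};
  const double scale = 1.5, eps = 1e-5;

  const int n = static_cast<int>(x.size());
  double mu = 0, var = 0, dy_mean = 0, dy_xmu_mean = 0;
  for (double v : x) mu += v;
  mu /= n;
  for (int i = 0; i < n; ++i) {
    var += (x[i] - mu) * (x[i] - mu);
    dy_mean += dy[i];
    dy_xmu_mean += dy[i] * (x[i] - mu);
  }
  var /= n;
  dy_mean /= n;
  dy_xmu_mean /= n;

  for (int i = 0; i < n; ++i) {
    double dx = scale *
                (dy[i] - dy_mean - (x[i] - mu) * dy_xmu_mean / (var + eps)) /
                std::sqrt(var + eps);
    std::printf("dx[%d]=%f\n", i, dx);
  }
}
```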
*/ - -// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in -// cmake/operators.cmake when Paddle supports -#if (CANN_VERSION_CODE >= 504000) - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class NPUTakeAlongAxisKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner( - "GatherElements", {*input, *index}, {*result}, {{"dim", axis}}); - runner.Run(stream); - } -}; - -template -class NPUTakeAlongAxisGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result_grad = - ctx.Input(framework::GradVarName("Result")); - - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - input_grad->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("ScatterAddWithAxis", - {*input_grad, *index, *result_grad}, - {*input_grad}, - {{"axis", axis}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - take_along_axis, - ops::NPUTakeAlongAxisKernel, - ops::NPUTakeAlongAxisKernel, - ops::NPUTakeAlongAxisKernel, - ops::NPUTakeAlongAxisKernel) -REGISTER_OP_NPU_KERNEL( - take_along_axis_grad, - ops::NPUTakeAlongAxisGradKernel, - ops::NPUTakeAlongAxisGradKernel, - ops::NPUTakeAlongAxisGradKernel, - ops::NPUTakeAlongAxisGradKernel) - -#endif diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc deleted file mode 100644 index 2e3ab9dac046130ba66e04d8ae67e3c630883b0c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tile_op_npu.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
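The take_along_axis kernels above map directly onto Ascend GatherElements (forward) and ScatterAddWithAxis (backward). For a 2-D input with dim = 1, the forward op computes result[i][j] = input[i][index[i][j]]; the backward pass scatter-adds result_grad through the same indices. Illustrative host code, not the NPU path:

```cpp
#include <cstdio>
#include <vector>

// What the "GatherElements" call above computes for a 2-D input with
// dim = 1: result[i][j] = input[i][index[i][j]].
int main() {
  const std::vector<std::vector<float>> input = {{10, 20, 30}, {40, 50, 60}};
  const std::vector<std::vector<int>> index = {{2, 0}, {1, 1}};
  for (size_t i = 0; i < index.size(); ++i)
    for (size_t j = 0; j < index[i].size(); ++j)
      std::printf("result[%zu][%zu] = %g\n", i, j, input[i][index[i][j]]);
}
```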
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/tile_op_functor.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class TileNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, - 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - repeat_times_size)); - rank = std::max(rank, repeat_times_size); - Tile(context); - } - - protected: - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], - 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), - vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), - repeat_times.size())); - auto* out0 = context.Output("Out"); - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - out0->mutable_data(context.GetPlace()); - - std::vector temp(repeat_times.size(), 1); - if (repeat_times == temp) { - framework::TensorCopy(*in0, - context.GetPlace(), - context.template device_context(), - out0); - return; - } - - // const auto& runner = - // NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); - auto stream = context.template device_context().stream(); - NpuOpRunner runner; - runner.SetType("Tile") - .AddInput(*in0) - .AddInput(std::move(repeat_times)) - .AddOutput(*out0) - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(tile, - ops::TileNPUKernel, - ops::TileNPUKernel, -#ifdef 
PADDLE_WITH_ASCEND_INT64 - ops::TileNPUKernel, -#endif - ops::TileNPUKernel, - ops::TileNPUKernel); diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc deleted file mode 100644 index 478523721458dd75a6692e6d77abe24534284961..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/top_k_op.h" - -namespace paddle { -namespace operators { - -void gen_assist_seq(phi::DenseTensor* assit_tensor, - int64_t dim, - const framework::ExecutionContext& ctx) { - const int64_t dimx2 = dim; - std::vector assit; - assit.resize(2 * dimx2); - for (int64_t i = 0; i < dimx2; i++) { - // for i in range [0, dim] - assit[i] = static_cast(i); - - // for i in range [dim, dimx2] - int64_t idx = - static_cast(static_cast(i)); - int64_t gap = i - idx; - assit[i + dim] = static_cast(gap); - } - framework::TensorFromVector(assit, ctx.device_context(), assit_tensor); -} - -template -class TopkNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // read input - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - size_t k = static_cast(ctx.Attr("k")); - - output->mutable_data(ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); - - // prepare assit - auto size = input->dims().size(); - // dim is the last dimension of input - auto dim = input->dims()[size - 1]; - phi::DenseTensor assist_seq_tensor; - assist_seq_tensor.Resize({2 * dim}); - assist_seq_tensor.mutable_data(ctx.GetPlace()); - gen_assist_seq(&assist_seq_tensor, dim, ctx); - - framework::NPUAttributeMap attr_input = {{"sorted", "true"}, - {"k", static_cast(k)}, - {"dim", -1}, - {"largest", true}}; - - phi::DenseTensor tmp_indices(phi::DataType::INT32); - tmp_indices.Resize(indices->dims()); - tmp_indices.mutable_data(ctx.GetPlace()); - - // run ascend - const auto& runner = NpuOpRunner("TopKD", - {*input, assist_seq_tensor}, - {*output, tmp_indices}, - attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - // cast indices from INT32 to INT64 - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(indices->dtype())); - const auto& runner_cast_indices = - NpuOpRunner("Cast", - {tmp_indices}, - {*indices}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_indices.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -// Ascend Op TopKD only support input float 16 dtype -REGISTER_OP_NPU_KERNEL(top_k, - ops::TopkNPUKernel); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc deleted file mode 100644 index 4e0b0650b9af6825e0a0bda40babb5974a391493..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { -// NOTE(Ruibiao): the Ascend TopKV2 operator used in this kernel -// may lead to large accuracy error for float32 data -template -class TopkV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* k_tensor = context.Input("K"); - auto* out = context.Output("Out"); - auto* indices = context.Output("Indices"); // type: INT64 - - int32_t k = static_cast(context.Attr("k")); - int axis = static_cast(context.Attr("axis")); - const bool sorted = static_cast(context.Attr("sorted")); - const bool largest = static_cast(context.Attr("largest")); - - if (axis < 0) { - axis += input->dims().size(); - } - - if (k_tensor != nullptr) { - std::vector v_tmp(1); - paddle::framework::TensorToVector( - *k_tensor, - context.template device_context(), - &v_tmp); - k = static_cast(v_tmp[0]); - } - - framework::DDim output_dims = input->dims(); - output_dims[axis] = k; - - out->Resize(output_dims); - indices->Resize(output_dims); - - out->mutable_data(context.GetPlace()); - indices->mutable_data(context.GetPlace()); - - phi::DenseTensor indices_int32(phi::DataType::INT32); - indices_int32.Resize(output_dims); - indices_int32.mutable_data(context.GetPlace()); - - auto npu_stream = - context.template device_context() - .stream(); - - NpuOpRunner npu_op_runner_topkv2; - npu_op_runner_topkv2.SetType("TopKV2") - .AddInput(*input) - .AddInput(std::vector{k}) - .AddOutput(*out) - .AddOutput(indices_int32) - .AddAttr("sorted", sorted) - .AddAttr("dim", axis) - .AddAttr("largest", largest) - .Run(npu_stream); - - // Cast 'indices_int32' to 'indices', from INT32 to INT64 - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(indices->type())); - const auto& npu_op_runner_cast = - NpuOpRunner("Cast", - {indices_int32}, - {*indices}, - {{"dst_type", static_cast(dst_dtype)}}); - npu_op_runner_cast.Run(npu_stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(top_k_v2, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc deleted file mode 100644 index 5af2edd60ce8f28ce8fc61e9995d31a729f7ec07..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
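A note on the TopkV2NPUKernel above: an optional "K" input tensor overrides the "k" attribute (read back to host via TensorToVector), a negative axis is wrapped by the input rank, the output shape keeps every dim except dims[axis] = k, and the INT32 indices from TopKV2 are cast to INT64 afterwards. A sketch of that host-side bookkeeping, with hypothetical inputs:

```cpp
#include <cstdio>
#include <vector>

// Host-side bookkeeping mirrored from TopkV2NPUKernel: optional k override,
// negative-axis wrap, and output-shape computation.
int main() {
  std::vector<int> dims = {2, 3, 8};  // input shape
  int axis = -1;                      // attribute "axis"
  int k = 4;                          // attribute "k"
  const int* k_tensor = nullptr;      // optional Input("K"), none here

  if (axis < 0) axis += static_cast<int>(dims.size());
  if (k_tensor != nullptr) k = *k_tensor;  // TensorToVector in the kernel

  std::vector<int> out_dims = dims;
  out_dims[axis] = k;
  std::printf("axis=%d k=%d out=[%d,%d,%d]\n", axis, k, out_dims[0],
              out_dims[1], out_dims[2]);
}
```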
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_op.h" - -namespace paddle { -namespace operators { - -template -class TransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - std::vector axis = ctx.Attr>("axis"); - out->mutable_data(ctx.device_context().GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(*x) - .AddInput(std::move(axis)) - .AddOutput(*out); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class TransposeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - std::vector axis = ctx.Attr>("axis"); - std::vector reversed_axis(axis); - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - x_grad->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(*out_grad) - .AddInput(std::move(reversed_axis)) - .AddOutput(*x_grad); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - transpose2, - ops::TransposeNPUKernel, - ops::TransposeNPUKernel, - ops::TransposeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::TransposeNPUKernel, -#endif - ops::TransposeNPUKernel, - ops::TransposeNPUKernel); - -REGISTER_OP_NPU_KERNEL(transpose2_grad, - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::TransposeGradNPUKernel, -#endif - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc deleted file mode 100644 index 0ef5af349decfaf99a07632fb8de392e575144f7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
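The inverse-permutation trick in TransposeGradNPUKernel above deserves a line: if the forward pass transposed with `axis`, the backward pass transposes out_grad with `reversed_axis` where reversed_axis[axis[i]] = i, so the two permutations compose to the identity. A small self-contained check:

```cpp
#include <cstdio>
#include <vector>

// Inverse permutation as used by the transpose grad kernel above.
int main() {
  const std::vector<int> axis = {2, 0, 1};  // forward permutation
  std::vector<int> reversed(axis.size());
  for (size_t i = 0; i < axis.size(); ++i)
    reversed[axis[i]] = static_cast<int>(i);

  // Applying `axis` and then `reversed` maps every dim back to itself.
  for (size_t i = 0; i < axis.size(); ++i)
    std::printf("reversed[%d] = %d\n", axis[i], reversed[axis[i]]);
}
```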
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(transpose2); -USE_OP_DEVICE_KERNEL(transpose2, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto out = scope->Var("Out"); - auto xshape = scope->Var("XShape"); - auto* x_t = x->GetMutable(); - auto* out_t = out->GetMutable(); - auto* xshape_t = xshape->GetMutable(); - auto place = ctx.GetPlace(); - - int dim0 = 2; - int dim1 = 3; - paddle::framework::TensorFromVector( - std::vector({0, 1, 2, 3, 4, 5}), ctx, x_t); - ctx.Wait(); - x_t->Resize({dim0, dim1}); - out_t->Resize({dim0, dim1}); - ctx.Wait(); - out_t->mutable_data(place); - ctx.Wait(); - xshape_t->Resize({dim0, dim1}); - xshape_t->mutable_data(place); - f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, - {"data_format", std::string("AnyLayout")}}; - auto op = f::OpRegistry::CreateOp("transpose2", - {{"X", {"X"}}}, - {{"Out", {"Out"}}, {"XShape", {"XShape"}}}, - attrs); - ctx.Wait(); - op->Run(*scope, place); - ctx.Wait(); - std::vector out_v; - paddle::framework::TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); - - EXPECT_EQ(out_t->numel(), dim0 * dim1); - EXPECT_EQ(out_v[0], 0); - EXPECT_EQ(out_v[1], 3); - EXPECT_EQ(out_v[2], 1); - EXPECT_EQ(out_v[3], 4); - EXPECT_EQ(out_v[4], 2); - EXPECT_EQ(out_v[5], 5); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto xshape = scope->Var("XShape"); - auto x_grad = scope->Var("X@GRAD"); - auto out_grad = scope->Var("Out@GRAD"); - - auto* x_grad_t = x_grad->GetMutable(); - auto* xshape_t = xshape->GetMutable(); - auto* out_grad_t = out_grad->GetMutable(); - - int dim0 = 2; - int dim1 = 3; - auto place = ctx.GetPlace(); - - paddle::framework::TensorFromVector( - std::vector({0, 1, 2, 3, 4, 5}), ctx, out_grad_t); - ctx.Wait(); - - x_grad_t->Resize({dim0, dim1}); - xshape_t->Resize( - {0, - dim0, - dim1}); // NOTE(zhiqiu): 0 is needed, see its infershape function - out_grad_t->Resize({dim0, dim1}); - - f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, - {"data_format", std::string("AnyLayout")}}; - - auto op = f::OpRegistry::CreateOp( - "transpose2_grad", - {{"Out@GRAD", {"Out@GRAD"}}, {"XShape", {"XShape"}}}, - {{"X@GRAD", {"X@GRAD"}}}, - attrs); - - op->Run(*scope, place); - ctx.Wait(); - std::vector out_v; - paddle::framework::TensorToVector(*x_grad_t, ctx, &out_v); - ctx.Wait(); - - EXPECT_EQ(x_grad_t->numel(), dim0 * dim1); - EXPECT_EQ(out_v[0], 0); - EXPECT_EQ(out_v[1], 3); - EXPECT_EQ(out_v[2], 1); - EXPECT_EQ(out_v[3], 4); - EXPECT_EQ(out_v[4], 2); - EXPECT_EQ(out_v[5], 5); -} - -TEST(transpose2, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(transpose2_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc deleted file mode 100644 index b47797a5bb131d730291a1a6c60615ca20001969..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/tril_triu_op_npu.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class TrilTriuNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int diagonal = ctx.Attr("diagonal"); - bool lower = ctx.Attr("lower"); - - out->mutable_data(ctx.GetPlace()); - - std::string op_type = lower ? "Tril" : "Triu"; - - framework::NPUAttributeMap attr_input = {{"diagonal", diagonal}}; - - const auto& dev_ctx = - ctx.template device_context(); - - auto op_func_tril = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("Tril", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - }; - - auto op_func_triu = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("Triu", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - }; - - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::BOOL) { - if (lower) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attr_input, - dev_ctx, - op_func_tril, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } else { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attr_input, - dev_ctx, - op_func_triu, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } - } else { - const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - tril_triu, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc deleted file mode 100644 index da9fa93130bd191efb6b82baf1e78075286d5660..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
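For the tril_triu kernel above: `lower` selects between the Ascend Tril and Triu ops and `diagonal` offsets the kept band, i.e. tril keeps entries with j - i <= diagonal and triu keeps j - i >= diagonal; bool inputs take the TypeAdapter detour through UINT8, presumably because the Ascend ops lack a bool variant. A host-side illustration of the band mask only:

```cpp
#include <cstdio>

// Band mask semantics of Tril/Triu with a diagonal offset.
int main() {
  const int n = 4, diagonal = 0;
  const bool lower = true;  // attribute "lower": Tril vs Triu
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      const bool keep = lower ? (j - i <= diagonal) : (j - i >= diagonal);
      std::printf("%d ", keep ? 1 : 0);
    }
    std::printf("\n");
  }
}
```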
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
deleted file mode 100644
index da9fa93130bd191efb6b82baf1e78075286d5660..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <limits>
-#include <random>
-
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // TODO(zhiqiu): support dynamic shape and call ParameterizedTruncatedNormal
-    std::vector<int> shape = ctx.Attr<std::vector<int>>("shape");
-    phi::DenseTensor shape_tensor(phi::DataType::INT32);
-    shape_tensor.mutable_data<int32_t>({static_cast<int64_t>(shape.size())},
-                                       ctx.GetPlace());
-    paddle::framework::TensorFromVector(
-        shape, ctx.device_context(), &shape_tensor);
-    float mean = ctx.Attr<float>("mean");
-    phi::DenseTensor mean_tensor(phi::DataType::FLOAT32);
-    mean_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    FillNpuTensorWithConstant<float>(&mean_tensor, mean);
-
-    float std = ctx.Attr<float>("std");
-    phi::DenseTensor std_tensor(phi::DataType::FLOAT32);
-    std_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    FillNpuTensorWithConstant<float>(&std_tensor, std);
-
-    int32_t seed_var = ctx.Attr<int32_t>("seed");
-
-    phi::DenseTensor min_tensor(phi::DataType::FLOAT32);
-    min_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    float min_value = mean - std * 2.0;
-    FillNpuTensorWithConstant<float>(&min_tensor, min_value);
-
-    phi::DenseTensor max_tensor(phi::DataType::FLOAT32);
-    max_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    float max_value = mean + std * 2.0;
-    FillNpuTensorWithConstant<float>(&max_tensor, max_value);
-
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    const auto& runner = NpuOpRunner(
-        "ParameterizedTruncatedNormal",
-        {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor},
-        {*out},
-        {{"seed", seed_var}});
-    runner.Run(stream);
-  }
-};
-
-// NOTE(zhiqiu): actually, this is cpu version kernel, and we need to make the
-// above npu version work in the future.
-template <typename T>
-class NPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.Attr<float>("mean");
-    float std = context.Attr<float>("std");
-    auto* tensor = context.Output<phi::DenseTensor>("Out");
-    tensor->mutable_data<T>(context.GetPlace());
-
-    phi::DenseTensor cpu_tensor(tensor->dtype());
-    cpu_tensor.Resize(tensor->dims());
-    T* cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
-    std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
-                                           1.0);
-    TruncatedNormal<T> truncated_normal(mean, std);
-    int64_t size = tensor->numel();
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    auto engine = phi::GetCPURandomEngine(seed);
-    for (int64_t i = 0; i < size; ++i) {
-      cpu_data[i] = truncated_normal(dist(*engine));
-    }
-    framework::TensorCopy(
-        cpu_tensor,
-        context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(),
-        tensor);
-    context.template device_context<paddle::platform::NPUDeviceContext>()
-        .Wait();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(truncated_gaussian_random,
                       ops::NPUTruncatedGaussianRandomKernel<float>);
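Both kernels above keep samples within two standard deviations of the mean: the NPU path passes [mean - 2*std, mean + 2*std] as min/max tensors, and the CPU fallback maps uniform draws through the TruncatedNormal inverse-CDF functor. A standalone sketch of equivalent sampling by rejection (an alternative method for illustration, not the functor Paddle uses):

// Standalone sketch: truncated-normal sampling by rejection, keeping draws
// inside [mean - 2 * stddev, mean + 2 * stddev] like the kernels above.
#include <random>

float SampleTruncatedNormal(float mean, float stddev, std::mt19937* rng) {
  std::normal_distribution<float> dist(mean, stddev);
  for (;;) {
    const float v = dist(*rng);
    if (v >= mean - 2.0f * stddev && v <= mean + 2.0f * stddev) return v;
  }
}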
diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc
deleted file mode 100644
index 5958a7751b8beb2b6f3c57311904a11dcc5c8a2a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/uniform_random_op_npu.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/uniform_random_op.h"
-#include "paddle/phi/core/generator.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class NPUUniformRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    phi::DenseTensor *tensor = nullptr;
-    auto out_var = ctx.OutputVar("Out");
-    std::vector<int64_t> new_shape;
-    auto list_new_shape_tensor =
-        ctx.MultiInput<phi::DenseTensor>("ShapeTensorList");
-    if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) {
-      if (ctx.HasInput("ShapeTensor")) {
-        auto *shape_tensor = ctx.Input<phi::DenseTensor>("ShapeTensor");
-        new_shape = GetNewDataFromShapeTensor(shape_tensor);
-      } else if (list_new_shape_tensor.size() > 0) {
-        new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor);
-      }
-    }
-
-    if (out_var->IsType<phi::SelectedRows>()) {
-      auto *selected_rows = out_var->GetMutable<phi::SelectedRows>();
-      tensor = selected_rows->mutable_value();
-      auto shape = ctx.Attr<std::vector<int64_t>>("shape");
-      if (!new_shape.empty()) shape = new_shape;
-      tensor->Resize(phi::make_ddim(shape));
-      selected_rows->mutable_rows()->reserve(shape[0]);
-    } else if (out_var->IsType<phi::DenseTensor>()) {
-      tensor = out_var->GetMutable<phi::DenseTensor>();
-      if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape));
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Expected type of Output(out) in uniform_random_op must be "
-          "phi::DenseTensor, "
-          "SelectedRows. But got "
-          "unsupport type: %s.",
-          framework::ToTypeName(out_var->Type())));
-    }
-    tensor->mutable_data<T>(ctx.GetPlace());
-    int64_t size = tensor->numel();
-
-    phi::DenseTensor cpu_tensor(tensor->dtype());
-    cpu_tensor.Resize(tensor->dims());
-    T *data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());
-
-    std::uniform_real_distribution<T> dist(
-        static_cast<T>(ctx.Attr<float>("min")),
-        static_cast<T>(ctx.Attr<float>("max")));
-    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
-    auto engine = phi::GetCPURandomEngine(seed);
-
-    for (int64_t i = 0; i < size; ++i) {
-      data_cpu[i] = dist(*engine);
-    }
-
-    unsigned int diag_num =
-        static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
-    unsigned int diag_step =
-        static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
-    auto diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
-    if (diag_num > 0) {
-      PADDLE_ENFORCE_GT(
-          size,
-          (diag_num - 1) * (diag_step + 1),
-          platform::errors::InvalidArgument(
-              "ShapeInvalid: the diagonal's elements is equal (num-1) "
-              "* (step-1) with num %d, step %d,"
-              "It should be smaller than %d, but received %d",
-              diag_num,
-              diag_step,
-              (diag_num - 1) * (diag_step + 1),
-              size));
-      for (int64_t i = 0; i < diag_num; ++i) {
-        int64_t pos = i * diag_step + i;
-        data_cpu[pos] = diag_val;
-      }
-    }
-
-    // copy to NPU
-    framework::TensorCopy(
-        cpu_tensor,
-        ctx.GetPlace(),
-        ctx.template device_context<platform::DeviceContext>(),
-        tensor);
-    ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_NPU_KERNEL(uniform_random,
                       paddle::operators::NPUUniformRandomKernel<float>);
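The diagonal fill above writes diag_val at flattened offsets i * diag_step + i, i.e. every (diag_step + 1)-th element, and the PADDLE_ENFORCE_GT bound guarantees the largest offset, (diag_num - 1) * (diag_step + 1), stays inside the buffer. A standalone sketch of that indexing (hypothetical helper, not Paddle code):

// Standalone sketch of the diagonal indexing (hypothetical helper).
#include <cassert>
#include <cstdint>
#include <vector>

void FillDiag(std::vector<float>* data, int64_t diag_num, int64_t diag_step,
              float diag_val) {
  // Largest offset written is (diag_num - 1) * (diag_step + 1); the kernel
  // above enforces size > that bound before writing.
  assert((diag_num - 1) * (diag_step + 1) <
         static_cast<int64_t>(data->size()));
  for (int64_t i = 0; i < diag_num; ++i) {
    (*data)[i * diag_step + i] = diag_val;  // == i * (diag_step + 1)
  }
}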
diff --git a/paddle/fluid/operators/unsqueeze_op_npu.cc b/paddle/fluid/operators/unsqueeze_op_npu.cc
deleted file mode 100644
index b2b09faaa9d44590cb0fad49812d1d4e2662da90..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unsqueeze_op_npu.cc
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
deleted file mode 100644
index bf66941f902788a3985f4ca3c0300eb95e131e33..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(unsqueeze);
-USE_OP_DEVICE_KERNEL(unsqueeze, NPU);
-
-template <typename T>
-void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-
-  int dim0 = 5;
-  int dim1 = 10;
-
-  std::vector<T> init;
-  for (int64_t i = 0; i < dim0 * dim1; ++i) {
-    init.push_back(static_cast<T>(0.1));
-  }
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({dim0, dim1});
-
-  ctx.Wait();
-
-  // run
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-
-  std::vector<int> axis;
-  axis.push_back(1);
-  f::AttributeMap attrs = {{"axes", axis}};
-
-  auto op = f::OpRegistry::CreateOp(
-      "unsqueeze", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
-
-  op->Run(*scope, place);
-  ctx.Wait();
-
-  EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(3));
-  EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(5));
-  EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(1));
-  EXPECT_EQ((uint32_t)tensor_out->dims()[2], uint32_t(10));
-
-  std::vector<T> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], static_cast<T>(0.1));
-  }
-
-  ctx.Wait();
-}
-
-TEST(unsqueeze, NPU_fp32) {
-  f::Scope scope;
-  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
-  Compare<float>(&scope, *ctx);
-}
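The shape assertions in the deleted test encode the unsqueeze rule: axes = {1} inserts a size-1 dimension, turning [5, 10] into [5, 1, 10] while leaving the data untouched. A standalone sketch of that rule for a single non-negative axis (hypothetical helper, not Paddle code):

// Standalone sketch of the unsqueeze shape rule (hypothetical helper).
#include <cassert>
#include <vector>

std::vector<int> UnsqueezeShape(std::vector<int> dims, int axis) {
  dims.insert(dims.begin() + axis, 1);  // insert a size-1 dimension
  return dims;
}

int main() {
  assert((UnsqueezeShape({5, 10}, 1) == std::vector<int>{5, 1, 10}));
  return 0;
}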
diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc
deleted file mode 100644
index 4c1aa39168b69c944c5077b293402c00fe20c555..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/unstack_op_npu.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class UnStackNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *dy = ctx.Input<phi::DenseTensor>("X");
-    auto dx = ctx.MultiOutput<phi::DenseTensor>("Y");
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis += dy->dims().size();
-    int num = dy->dims()[axis];
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    std::vector<phi::DenseTensor> dx_list;
-    for (int i = 0; i < num; i++) {
-      dx[i]->mutable_data<T>(ctx.GetPlace());
-      dx_list.push_back(*dx[i]);
-    }
-
-    const auto &runner =
-        NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}});
-    runner.Run(stream);
-  }
-};
-
-template <typename T>
-class UnStackGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto x = ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("Y"));
-    auto *y = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis += (x[0]->dims().size() + 1);
-    int num = static_cast<int>(x.size());
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    std::vector<phi::DenseTensor> x_list;
-    for (int i = 0; i < num; i++) {
-      x_list.push_back(*x[i]);
-    }
-    y->mutable_data<T>(ctx.GetPlace());
-
-    const auto &runner =
-        NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}});
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace plat = paddle::platform;
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    unstack,
-    ops::UnStackNPUKernel<float>,
-    ops::UnStackNPUKernel<plat::float16>);
-
-REGISTER_OP_NPU_KERNEL(
-    unstack_grad,
-    ops::UnStackGradNPUKernel<float>,
-    ops::UnStackGradNPUKernel<plat::float16>);
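Note the asymmetric axis normalization above: the forward kernel adds the input's rank, while the grad kernel adds rank + 1 because its inputs are the already-unstacked slices, which have one dimension fewer than the tensor Pack rebuilds. A standalone sketch (hypothetical helpers, not Paddle code):

// Standalone sketch of the axis normalization (hypothetical helpers).
#include <cassert>

// Forward (Unpack): axis indexes the stacked input tensor.
int UnstackAxis(int axis, int input_rank) {
  return axis < 0 ? axis + input_rank : axis;
}

// Backward (Pack): inputs are slices with one dimension fewer, so the
// rebuilt tensor's rank is slice_rank + 1.
int UnstackGradAxis(int axis, int slice_rank) {
  return axis < 0 ? axis + slice_rank + 1 : axis;
}

int main() {
  assert(UnstackAxis(-1, 3) == 2);      // last axis of a rank-3 input
  assert(UnstackGradAxis(-1, 2) == 2);  // same axis seen from rank-2 slices
  return 0;
}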
diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc
deleted file mode 100644
index b5c61e6b988aac74635836898058f34eec64d840..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/where_index_op_npu.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class NPUWhereIndexKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto& dev_ctx =
-        context.template device_context<platform::NPUDeviceContext>();
-    auto* condition = context.Input<phi::DenseTensor>("Condition");
-    auto* out = context.Output<phi::DenseTensor>("Out");
-
-    auto dims = condition->dims();
-    const int rank = dims.size();
-
-    auto place = context.GetPlace();
-    const aclrtStream& stream = dev_ctx.stream();
-
-    // Run Cast and ReduceSum to get 0 dim of Out
-    phi::DenseTensor booled_cond;
-    if (framework::TransToProtoVarType(condition->dtype()) !=
-        framework::proto::VarType::BOOL) {
-      auto bool_type = ConvertToNpuDtype(framework::proto::VarType::BOOL);
-      booled_cond.mutable_data<bool>(dims, place);
-      const auto& booled_runner =
-          NpuOpRunner("Cast",
-                      {*condition},
-                      {booled_cond},
-                      {{"dst_type", static_cast<int>(bool_type)}});
-      booled_runner.Run(stream);
-    } else {
-      booled_cond.ShareDataWith(*condition);
-    }
-    phi::DenseTensor casted_cond;
-    auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT64);
-    casted_cond.mutable_data<int64_t>(dims, place);
-    const auto& cast_runner =
-        NpuOpRunner("Cast",
-                    {booled_cond},
-                    {casted_cond},
-                    {{"dst_type", static_cast<int>(dst_dtype)}});
-    cast_runner.Run(stream);
-
-    phi::DenseTensor sumed_true_num;
-    sumed_true_num.mutable_data<int64_t>({1}, place);
-    phi::DenseTensor cond_axes;
-    cond_axes.mutable_data<int>({dims.size()}, place);
-    std::vector<int> axes_vec;
-    for (int i = 0; i < dims.size(); ++i) {
-      axes_vec.push_back(i);
-    }
-    framework::TensorFromVector(axes_vec, dev_ctx, &cond_axes);
-    const auto& sum_runner = NpuOpRunner("ReduceSum",
-                                         {casted_cond, cond_axes},
-                                         {sumed_true_num},
-                                         {{"keep_dims", false}});
-    sum_runner.Run(stream);
-
-    phi::DenseTensor local_true_num;
-    paddle::framework::TensorCopySync(
-        sumed_true_num, platform::CPUPlace(), &local_true_num);
-    auto true_num = *local_true_num.data<int64_t>();
-
-    out->Resize(phi::make_ddim({true_num, rank}));
-    out->mutable_data<int64_t>(place);
-
-    if (true_num == 0) {
-      return;
-    }
-
-    out->set_layout(DataLayout::kAnyLayout);
-    NpuOpRunner runner{"Where", {*condition}, {*out}};
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_NPU_KERNEL(where_index,
-                       ops::NPUWhereIndexKernel<int64_t>,
-                       ops::NPUWhereIndexKernel<int>,
-                       ops::NPUWhereIndexKernel<bool>,
-                       ops::NPUWhereIndexKernel<float>,
-                       ops::NPUWhereIndexKernel<double>);
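The kernel above sizes its output in two passes: Cast plus ReduceSum count the true entries so Out can be resized to [true_num, rank] before the Where op writes one coordinate row per true entry. A standalone CPU reference for those semantics, assuming row-major flattening (hypothetical helper, not Paddle code):

// Standalone CPU reference for where_index semantics (hypothetical helper):
// one [rank]-sized coordinate row per true entry of a row-major bool tensor.
#include <cstdint>
#include <vector>

std::vector<std::vector<int64_t>> WhereIndex(
    const std::vector<bool>& cond, const std::vector<int64_t>& dims) {
  std::vector<std::vector<int64_t>> rows;  // result shape: [true_num, rank]
  for (int64_t flat = 0; flat < static_cast<int64_t>(cond.size()); ++flat) {
    if (!cond[flat]) continue;
    std::vector<int64_t> coord(dims.size());
    int64_t rem = flat;
    for (int d = static_cast<int>(dims.size()) - 1; d >= 0; --d) {
      coord[d] = rem % dims[d];  // unflatten, innermost dimension last
      rem /= dims[d];
    }
    rows.push_back(coord);
  }
  return rows;
}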
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class WhereNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Select", {*condition, *X, *Y}, {*out}, {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class WhereGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* dout_t = ctx.Input(framework::GradVarName("Out")); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - auto* dy_t = ctx.Output(framework::GradVarName("Y")); - - if (dx_t != nullptr) { - dx_t->mutable_data(ctx.GetPlace()); - } - if (dy_t != nullptr) { - dy_t->mutable_data(ctx.GetPlace()); - } - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_zeros(dout_t->dtype()); - tensor_zeros.mutable_data(dout_t->dims(), ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ZerosLike", {*dout_t}, {tensor_zeros}, {}); - runner.Run(stream); - - if (dx_t != nullptr) { - const auto& runner = NpuOpRunner( - "Select", {*condition, *dout_t, tensor_zeros}, {*dx_t}, {}); - runner.Run(stream); - } - if (dy_t != nullptr) { - const auto& runner = NpuOpRunner( - "Select", {*condition, tensor_zeros, *dout_t}, {*dy_t}, {}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - where, - ops::WhereNPUKernel, - ops::WhereNPUKernel, - ops::WhereNPUKernel, - ops::WhereNPUKernel); - -REGISTER_OP_NPU_KERNEL( - where_grad, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel);