Unverified commit 5648bd80, authored by pangyoki, committed by GitHub

[NPU] Remove TensorFromVector and avoid sync copy in npu op kernel for better performance (#31994)

* enable async copy and add wait before sync operation

* remove unnecessary wait

* add FillNpuTensorWithConstant

* refine

* fix fill_constant

* change TensorFromVector to FillNpuTensorWithConstant

* fix ignored api

* delete extra unittest

* fix little error

* fix update_loss_scaling_op_npu and check_finite_and_unscale_op_npu

* change TensorCopySync to TensorCopy

* delete useless Wait and add StreamWait

* fix npu_stream error

* fix check_finite_and_unscale_op_npu TensorCopy

* only save stream wait

* fix NPUDeviceContext in all c++ unittest

* delete wait
Co-authored-by: zhiqiu <chenqiuliang@baidu.com>
Parent 5ad94e7b
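The core of the change is visible in every kernel below: a scalar that used to be staged in a std::vector and pushed to the device with TensorFromVector (which forces a blocking host-to-device copy) is now written with FillNpuTensorWithConstant, and blocking TensorCopySync calls become asynchronous TensorCopy calls on the kernel's device context. A minimal before/after sketch of the scalar-fill pattern, assuming a Paddle NPU kernel body with a place and an ExecutionContext ctx as in the hunks below:

    // Old pattern: stage the scalar on the host and copy synchronously.
    // TensorFromVector(std::vector<float>{factor}, ctx.device_context(),
    //                  &factor_tensor);

    // New pattern: allocate the 1-element tensor and fill it directly,
    // with no host-side vector and no forced synchronization.
    Tensor factor_tensor(framework::proto::VarType::FP32);
    factor_tensor.mutable_data<float>({1}, place);
    FillNpuTensorWithConstant<float>(&factor_tensor, factor);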
@@ -77,8 +77,7 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
     // 2.1 Get a factor tensor with shape [1].
     Tensor factor_tensor(framework::proto::VarType::FP32);
     factor_tensor.mutable_data<float>({1}, place);
-    TensorFromVector(std::vector<float>{factor}, ctx.device_context(),
-                     &factor_tensor);
+    FillNpuTensorWithConstant<float>(&factor_tensor, factor);
     // 2.2 Get the factor which has the shape with x and the same value with
     // factor.
......
@@ -44,10 +44,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
     // step1: inverse scale(RealDiv)
     Tensor const_tensor;
     const_tensor.mutable_data<T>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<T>{static_cast<T>(1.0)}, ctx.device_context(),
-                     &const_tensor);
-    ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
+    FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));

     // Inverse(1.0/scale)
     Tensor* tmp_inverse_out = const_cast<Tensor*>(scale);
@@ -105,7 +102,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
       bool* is_found_inf =
           found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
       *is_found_inf = true;
-      framework::TensorCopySync(found_inf_tensor, ctx.GetPlace(), found_inf);
+      framework::TensorCopy(
+          found_inf_tensor, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), found_inf);
+      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
     }
   }
 };
......
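When a result does have to be read on the host (the found_inf flag above, or the start/end/step scalars in the range kernel further below), the synchronous TensorCopySync is replaced by an asynchronous TensorCopy followed by an explicit Wait() on the NPU device context before the host touches the data. A hedged sketch of that device-to-host pattern, using the same calls as this diff (the tensor and variable names here are illustrative only):

    framework::Tensor cpu_tensor;
    framework::TensorCopy(
        npu_tensor, platform::CPUPlace(),
        ctx.template device_context<platform::DeviceContext>(), &cpu_tensor);
    // The copy is only enqueued on the NPU stream; synchronize before the
    // host dereferences the buffer.
    ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
    T host_value = cpu_tensor.data<T>()[0];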
@@ -41,7 +41,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     // bad_out_data = bad_in_data + 1
     Tensor factor_tensor(bad_out_tensor->type());
     factor_tensor.mutable_data<int>({1}, place);
-    TensorFromVector(std::vector<int>{1}, ctx, &factor_tensor);
+    FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
     auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
                                  {*bad_out_tensor}, {});
     runner_p2.Run(stream);
@@ -84,7 +84,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     // good_out_data = good_in_data + 1
     Tensor factor_tensor(good_out_tensor->type());
     factor_tensor.mutable_data<int>({1}, place);
-    TensorFromVector(std::vector<int>{1}, ctx, &factor_tensor);
+    FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
     auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
                                  {*good_out_tensor}, {});
     runner_p2.Run(stream);
......
@@ -100,9 +100,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
                                   {{"axes", axes}, {"keep_dims", true}});
         runner.Run(stream);
       } else {
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .Wait();
-        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
+        framework::TensorCopy(
+            *tmp_dout, ctx.GetPlace(),
+            ctx.template device_context<platform::DeviceContext>(), dx);
       }
     }
@@ -127,8 +127,6 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
                                   {{"axes", axes}, {"keep_dims", false}});
         runner.Run(stream);
         tmp_dout = &reduced_dout;
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .Wait();
       }

       // stage 2
@@ -144,9 +142,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
                                   {{"axes", axes}, {"keep_dims", true}});
         runner.Run(stream);
       } else {
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .Wait();
-        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy);
+        framework::TensorCopy(
+            *tmp_dout, ctx.GetPlace(),
+            ctx.template device_context<platform::DeviceContext>(), dy);
       }
     }
   }
......
@@ -102,7 +102,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
                                   {{"axes", axes}, {"keep_dims", true}});
         runner.Run(stream);
       } else {
-        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
+        framework::TensorCopy(
+            *tmp_dout, ctx.GetPlace(),
+            ctx.template device_context<platform::DeviceContext>(), dx);
       }
     }
     if (dy) {
......
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "paddle/fluid/operators/increment_op.h"
-#include "paddle/fluid/platform/float16.h"
-
 #include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/platform/float16.h"

 namespace paddle {
 namespace framework {
@@ -30,7 +29,6 @@ class OpBase;
 namespace paddle {
 namespace operators {

-
 template <typename DeviceContext, typename T>
 class IncrementalNPUKernel : public framework::OpKernel<T> {
  public:
@@ -41,21 +39,15 @@ class IncrementalNPUKernel : public framework::OpKernel<T> {
     out_tensor->mutable_data<T>(context.GetPlace());

     Tensor step_tensor(x_tensor->type());
-    std::vector<T> step_vec;
-    step_vec.push_back(static_cast<T>(step));
-    framework::TensorFromVector(
-        step_vec,
-        context.device_context(),
-        &step_tensor);
+    step_tensor.mutable_data<T>({1}, context.GetPlace());
+    FillNpuTensorWithConstant<T>(&step_tensor, static_cast<T>(step));

-    auto runner = NpuOpRunner("Add",
-                              {*x_tensor, step_tensor},
-                              {*out_tensor},
-                              {});
+    auto runner =
+        NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {});

     auto stream =
         context.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
     runner.Run(stream);
   }
 };
@@ -63,7 +55,6 @@ class IncrementalNPUKernel : public framework::OpKernel<T> {
 } // namespace operators
 } // namespace paddle

 namespace plat = paddle::platform;
 namespace ops = paddle::operators;
@@ -73,5 +64,5 @@ REGISTER_OP_NPU_KERNEL(
     ops::IncrementalNPUKernel<paddle::platform::NPUDeviceContext, double>,
     ops::IncrementalNPUKernel<paddle::platform::NPUDeviceContext, int>,
     ops::IncrementalNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
-    ops::IncrementalNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
+    ops::IncrementalNPUKernel<paddle::platform::NPUDeviceContext,
+                              plat::float16>)
@@ -80,8 +80,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
       default_scale.mutable_data<T>(framework::make_ddim(axes), place);
       Tensor value(x->type());
       value.mutable_data<T>({1}, place);
-      TensorFromVector(std::vector<T>{static_cast<T>(1.0)},
-                       ctx.device_context(), &value);
+      FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
       auto runner =
           NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
       runner.Run(stream);
@@ -95,8 +94,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
       default_bias.mutable_data<T>(framework::make_ddim(axes), place);
       Tensor value(x->type());
       value.mutable_data<T>({1}, place);
-      TensorFromVector(std::vector<T>{static_cast<T>(0)}, ctx.device_context(),
-                       &value);
+      FillNpuTensorWithConstant<T>(&value, static_cast<T>(0));
       auto runner =
           NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}});
       runner.Run(stream);
@@ -251,8 +249,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
       default_scale.mutable_data<T>(framework::make_ddim(axes), place);
       Tensor value(x->type());
       value.mutable_data<T>({1}, place);
-      TensorFromVector(std::vector<T>{static_cast<T>(1.0)},
-                       ctx.device_context(), &value);
+      FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
       auto runner =
           NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
       runner.Run(stream);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <cmath>
#include <iostream>
#include <numeric>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(lookup_table_v2);
USE_OP_DEVICE_KERNEL(lookup_table_v2, NPU);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto ids = scope->Var("Ids");
auto out = scope->Var("Out");
auto w = scope->Var("W");
auto ids_t = ids->GetMutable<f::LoDTensor>();
auto out_t = out->GetMutable<f::LoDTensor>();
auto w_t = w->GetMutable<f::LoDTensor>();
int bsz = 10;
int dim = 32;
int seqlen = 8;
int vocab_size = 100;
TensorFromVector(std::vector<int64_t>(bsz * seqlen, 3), ctx, ids_t);
std::vector<T> val(vocab_size * dim, 10.);
TensorFromVector(val, ctx, w_t);
ids_t->Resize({bsz, seqlen});
w_t->Resize({vocab_size, dim});
out_t->Resize({bsz, seqlen, dim});
ctx.Wait();
auto place = ctx.GetPlace();
out_t->mutable_data<T>(place);
f::AttributeMap attrs = {{}};
auto op = f::OpRegistry::CreateOp("lookup_table_v2",
{{"W", {"W"}}, {"Ids", {"Ids"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
std::vector<T> out_v;
TensorToVector(*out_t, ctx, &out_v);
ctx.Wait();
EXPECT_EQ(out_t->numel(), bsz * seqlen * dim);
T res = std::accumulate(out_v.begin(), out_v.end(), 0.);
float eps = 1.e-6;
EXPECT_LT(fabs(res - bsz * seqlen * dim * 10.), eps);
}
template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto w = scope->Var("W");
auto ids = scope->Var("Ids");
auto out = scope->Var("DOut");
auto dw = scope->Var("DW");
auto w_t = w->GetMutable<f::LoDTensor>();
auto ids_t = ids->GetMutable<f::LoDTensor>();
auto out_t = out->GetMutable<f::LoDTensor>();
auto dw_t = dw->GetMutable<f::LoDTensor>();
int bsz = 2;
int dim = 2;
int seqlen = 2;
int vocab_size = 4;
std::vector<int64_t> val_int(bsz * seqlen, 3);
std::vector<T> val(vocab_size * dim, 0.);
std::vector<T> val_out(bsz * seqlen * dim, 1.);
TensorFromVector(val_int, ctx, ids_t);
TensorFromVector(val, ctx, w_t);
TensorFromVector(val, ctx, dw_t);
TensorFromVector(val_out, ctx, out_t);
w_t->Resize({vocab_size, dim});
ids_t->Resize({bsz, seqlen});
out_t->Resize({bsz, seqlen, dim});
dw_t->Resize({vocab_size, dim});
ctx.Wait();
auto place = ctx.GetPlace();
out_t->mutable_data<T>(place);
w_t->mutable_data<T>(place);
dw_t->mutable_data<T>(place);
f::AttributeMap attrs = {{}};
auto op = f::OpRegistry::CreateOp(
"lookup_table_v2_grad",
{{"Ids", {"Ids"}}, {"W", {"W"}}, {"Out@GRAD", {"DOut"}}},
{{"W@GRAD", {"DW"}}}, attrs);
op->Run(*scope, place);
ctx.Wait();
std::vector<T> w_v;
TensorToVector(*dw_t, ctx, &w_v);
ctx.Wait();
EXPECT_EQ(dw_t->numel(), vocab_size * dim);
T res = std::accumulate(w_v.begin(), w_v.end(), 0.);
float eps = 1.e-6;
EXPECT_LT(fabs(res - bsz * seqlen * dim), eps);
}
TEST(lookup_table_v2, NPU_fp32) {
f::Scope scope;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(lookup_table_v2_grad, NPU_fp32) {
f::Scope scope;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx);
}
@@ -10,9 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/mean_op.h"
-#include "paddle/fluid/platform/float16.h"
-
 #include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/platform/float16.h"

 namespace paddle {
 namespace operators {
@@ -26,34 +25,27 @@ class MeanNPUKernel : public framework::OpKernel<T> {
     std::vector<int> axes;

-    framework::NPUAttributeMap attr_input = {
-        {"keep_dims", false},
-        {"axes", axes}};
+    framework::NPUAttributeMap attr_input = {{"keep_dims", false},
+                                             {"axes", axes}};

     out->mutable_data<T>(ctx.GetPlace());

-    auto runner = NpuOpRunner("ReduceMeanD",
-                              {*x},
-                              {*out},
-                              attr_input);
+    auto runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);

     auto stream =
-        ctx.template device_context<
-            paddle::platform::NPUDeviceContext>()
-            .stream();
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
     runner.Run(stream);
   }
 };

 template <typename DeviceContext, typename T>
 class MeanGradNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto stream =
-        context.template device_context<
-            paddle::platform::NPUDeviceContext>()
-            .stream();
+        context.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();

     auto grad = context.Input<Tensor>(framework::GradVarName("Out"));
@@ -76,11 +68,8 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
     Tensor mean_tensor(grad->type());
     mean_tensor.Resize({1});
     mean_tensor.mutable_data<T>(context.GetPlace());
-    std::vector<float> mean_vec;
-    mean_vec.push_back(1.0/static_cast<float>(IG->numel()));
-    framework::TensorFromVector(mean_vec,
-                                context.device_context(),
-                                &mean_tensor);
+    FillNpuTensorWithConstant<T>(
+        &mean_tensor, static_cast<T>(1.0 / static_cast<float>(IG->numel())));

     // means mul ones
     Tensor mean_ma(grad->type());
@@ -95,23 +84,19 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
   }
 };

 } // namespace operators
 } // namespace paddle

 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

 REGISTER_OP_NPU_KERNEL(
-    mean,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, int>,
+    mean, ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, int>,
     ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, float>,
     ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, double>,
     ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)

 REGISTER_OP_NPU_KERNEL(
-    mean_grad,
-    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
+    mean_grad, ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
     ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
     ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, double>,
     ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
@@ -61,23 +61,17 @@ class AdamNPUKernel : public framework::OpKernel<T> {
     param_out->mutable_data<T>(ctx.GetPlace());
     mom1_out->mutable_data<T>(ctx.GetPlace());
     mom2_out->mutable_data<T>(ctx.GetPlace());
+    beta1_pow_out->mutable_data<T>(ctx.GetPlace());
+    beta2_pow_out->mutable_data<T>(ctx.GetPlace());

     // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place.
     if (beta1_pow->place() == platform::CPUPlace()) {
-      float beta1 = *beta1_pow->data<float>();
-      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
-      TensorFromVector(std::vector<float>{beta1}, ctx.device_context(),
-                       beta1_pow_out);
-    } else {
-      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
+      T beta1 = *beta1_pow->data<T>();
+      FillNpuTensorWithConstant<T>(beta1_pow_out, beta1);
     }
     if (beta2_pow->place() == platform::CPUPlace()) {
-      float beta2 = *beta2_pow->data<float>();
-      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
-      TensorFromVector(std::vector<float>{beta2}, ctx.device_context(),
-                       beta2_pow_out);
-    } else {
-      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
+      T beta2 = *beta2_pow->data<T>();
+      FillNpuTensorWithConstant<T>(beta2_pow_out, beta2);
     }

     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
@@ -116,18 +110,15 @@ class AdamNPUKernel : public framework::OpKernel<T> {
     // reshape
     Tensor beta1_tensor(framework::proto::VarType::FP32);
-    beta1_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<T>{beta1}, ctx.device_context(),
-                     &beta1_tensor);
+    beta1_tensor.mutable_data<T>({1}, ctx.GetPlace());
+    FillNpuTensorWithConstant<T>(&beta1_tensor, beta1);

     Tensor beta2_tensor(framework::proto::VarType::FP32);
-    beta2_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<T>{beta2}, ctx.device_context(),
-                     &beta2_tensor);
+    beta2_tensor.mutable_data<T>({1}, ctx.GetPlace());
+    FillNpuTensorWithConstant<T>(&beta2_tensor, beta2);

     Tensor epsilon_tensor(framework::proto::VarType::FP32);
     epsilon_tensor.mutable_data<T>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<T>{epsilon}, ctx.device_context(),
-                     &epsilon_tensor);
+    FillNpuTensorWithConstant<T>(&epsilon_tensor, epsilon);

     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
@@ -146,16 +137,19 @@ class AdamNPUKernel : public framework::OpKernel<T> {
     // NOTE(zhiqiu): ApplyAdamD updates params inplace, so
     // if param and param_out is not same, we need to do copy.
     if (param_out->data<T>() != param->data<T>()) {
-      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-      framework::TensorCopySync(*param, ctx.GetPlace(), param_out);
+      framework::TensorCopy(
+          *param, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), param_out);
     }
     if (mom1_out->data<T>() != mom1->data<T>()) {
-      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-      framework::TensorCopySync(*mom1, ctx.GetPlace(), mom1_out);
+      framework::TensorCopy(
+          *mom1, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), mom1_out);
     }
     if (mom2_out->data<T>() != mom2->data<T>()) {
-      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-      framework::TensorCopySync(*mom2, ctx.GetPlace(), mom2_out);
+      framework::TensorCopy(
+          *mom2, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), mom2_out);
     }
     auto runner_m1 =
         NpuOpRunner("Mul", {*beta1_pow, beta1_tensor}, {*beta1_pow_out}, {});
......
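For device-to-device copies such as the param/momentum aliasing copies in the Adam kernel above (and the SGD kernel below), the explicit Wait() before the copy is dropped altogether: the asynchronous TensorCopy is issued on the same device context as the preceding NPU ops, so stream ordering already serializes it after them. A sketch of the resulting pattern, mirroring the code in this diff:

    if (param_out->data<T>() != param->data<T>()) {
      // Asynchronous copy on the kernel's device context; no Wait() is needed
      // because the copy is ordered on the same NPU stream as the ops that
      // produced *param.
      framework::TensorCopy(
          *param, ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(), param_out);
    }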
@@ -44,8 +44,9 @@ class SGDNPUKernel : public framework::OpKernel<T> {
     // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so
     // if param and param_out is not same, we need to do copy.
     if (param_out->data<T>() != param_var->data<T>()) {
-      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-      framework::TensorCopySync(*param_var, ctx.GetPlace(), param_out);
+      framework::TensorCopy(
+          *param_var, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), param_out);
     }
   }
 };
......
@@ -16,20 +16,19 @@ limitations under the License. */
 #include <memory>
 #include <string>

-#include "paddle/fluid/operators/range_op.h"
-#include "paddle/fluid/operators/npu_op_runner.h"
-#include "paddle/fluid/operators/utils.h"
-
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/operators/range_op.h"
+#include "paddle/fluid/operators/utils.h"

 namespace paddle {
 namespace operators {

 template <typename DeviceContext, typename T>
 class RangeNPUKernel : public framework::OpKernel<T> {
  public:
@@ -40,11 +39,23 @@ class RangeNPUKernel : public framework::OpKernel<T> {
     auto* out = context.Output<framework::Tensor>("Out");

     framework::Tensor n;
-    framework::TensorCopySync(*start_t, platform::CPUPlace(), &n);
+    framework::TensorCopy(
+        *start_t, platform::CPUPlace(),
+        context.template device_context<platform::DeviceContext>(), &n);
+    context.template device_context<paddle::platform::NPUDeviceContext>()
+        .Wait();
     T start = n.data<T>()[0];
-    framework::TensorCopySync(*end_t, platform::CPUPlace(), &n);
+    framework::TensorCopy(
+        *end_t, platform::CPUPlace(),
+        context.template device_context<platform::DeviceContext>(), &n);
+    context.template device_context<paddle::platform::NPUDeviceContext>()
+        .Wait();
     T end = n.data<T>()[0];
-    framework::TensorCopySync(*step_t, platform::CPUPlace(), &n);
+    framework::TensorCopy(
+        *step_t, platform::CPUPlace(),
+        context.template device_context<platform::DeviceContext>(), &n);
+    context.template device_context<paddle::platform::NPUDeviceContext>()
+        .Wait();
     T step = n.data<T>()[0];

     int64_t size = 0;
@@ -70,8 +81,7 @@ class RangeNPUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;

 REGISTER_OP_NPU_KERNEL(
-    range,
-    ops::RangeNPUKernel<paddle::platform::NPUDeviceContext, int>,
+    range, ops::RangeNPUKernel<paddle::platform::NPUDeviceContext, int>,
     ops::RangeNPUKernel<paddle::platform::NPUDeviceContext, float>,
     ops::RangeNPUKernel<paddle::platform::NPUDeviceContext, double>)
......
@@ -67,12 +67,10 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
     // on and off
     Tensor on_tensor(framework::proto::VarType::INT32);
     on_tensor.mutable_data<int>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<int>{static_cast<int>(1)},
-                     ctx.device_context(), &on_tensor);
+    FillNpuTensorWithConstant<int>(&on_tensor, static_cast<int>(1));
     Tensor off_tensor(framework::proto::VarType::INT32);
     off_tensor.mutable_data<int>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<int>{static_cast<int>(0)},
-                     ctx.device_context(), &off_tensor);
+    FillNpuTensorWithConstant<int>(&off_tensor, static_cast<int>(0));

     // one_hot
     Tensor tmp_onehot(on_tensor.type());
@@ -142,12 +140,10 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
     // on and off
     Tensor on_tensor(framework::proto::VarType::INT32);
     on_tensor.mutable_data<int>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<int>{static_cast<int>(1)},
-                     ctx.device_context(), &on_tensor);
+    FillNpuTensorWithConstant<int>(&on_tensor, static_cast<int>(1));
     Tensor off_tensor(framework::proto::VarType::INT32);
     off_tensor.mutable_data<int>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<int>{static_cast<int>(0)},
-                     ctx.device_context(), &off_tensor);
+    FillNpuTensorWithConstant<int>(&off_tensor, static_cast<int>(0));

     // one_hot
     Tensor tmp_onehot(on_tensor.type());
......
@@ -12,14 +12,14 @@ limitations under the License. */
 #include <memory>
 #include <string>

-#include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/operators/top_k_op.h"

 namespace paddle {
 namespace operators {

-void gen_assist_seq(framework::Tensor* assit_tensor,
-                    int64_t dim, const framework::ExecutionContext& ctx) {
+void gen_assist_seq(framework::Tensor* assit_tensor, int64_t dim,
+                    const framework::ExecutionContext& ctx) {
   const int64_t dimx2 = dim;
   std::vector<paddle::platform::float16> assit;
   assit.resize(2 * dimx2);
@@ -28,15 +28,14 @@ void gen_assist_seq(framework::Tensor* assit_tensor,
     assit[i] = static_cast<paddle::platform::float16>(i);

     // for i in range [dim, dimx2]
-    int64_t idx = static_cast<int64_t>(
-        static_cast<paddle::platform::float16>(i));
+    int64_t idx =
+        static_cast<int64_t>(static_cast<paddle::platform::float16>(i));
     int64_t gap = i - idx;
     assit[i + dim] = static_cast<paddle::platform::float16>(gap);
   }
   framework::TensorFromVector(assit, ctx.device_context(), assit_tensor);
 }

 template <typename DeviceContext, typename T>
 class TopkNPUKernel : public framework::OpKernel<T> {
  public:
@@ -64,10 +63,8 @@ class TopkNPUKernel : public framework::OpKernel<T> {
                                          {"largest", true}};

     // run ascend
-    auto runner = NpuOpRunner("TopKD",
-                              {*input, assist_seq_tensor},
-                              {*output, *indices},
-                              attr_input);
+    auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor},
+                              {*output, *indices}, attr_input);

     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
@@ -83,7 +80,6 @@ class TopkNPUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;

 // Ascend Op TopKD only support input float 16 dtype
-REGISTER_OP_NPU_KERNEL(
-    top_k,
-    ops::TopkNPUKernel<paddle::platform::NPUDeviceContext,
-                       paddle::platform::float16>);
+REGISTER_OP_NPU_KERNEL(top_k,
+                       ops::TopkNPUKernel<paddle::platform::NPUDeviceContext,
+                                          paddle::platform::float16>);
@@ -35,28 +35,24 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> {
     float mean = ctx.Attr<float>("mean");
     Tensor mean_tensor(framework::proto::VarType::FP32);
     mean_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<float>{mean}, ctx.device_context(),
-                     &mean_tensor);
+    FillNpuTensorWithConstant<float>(&mean_tensor, mean);

     float std = ctx.Attr<float>("std");
     Tensor std_tensor(framework::proto::VarType::FP32);
     std_tensor.mutable_data<float>({1}, ctx.GetPlace());
-    TensorFromVector(std::vector<float>{std}, ctx.device_context(),
-                     &std_tensor);
+    FillNpuTensorWithConstant<float>(&std_tensor, std);

     int32_t seed_var = ctx.Attr<int32_t>("seed");

     Tensor min_tensor(framework::proto::VarType::FP32);
     min_tensor.mutable_data<float>({1}, ctx.GetPlace());
     float min_value = mean - std * 2.0;
-    TensorFromVector(std::vector<float>{min_value}, ctx.device_context(),
-                     &min_tensor);
+    FillNpuTensorWithConstant<float>(&min_tensor, min_value);

     Tensor max_tensor(framework::proto::VarType::FP32);
     max_tensor.mutable_data<float>({1}, ctx.GetPlace());
     float max_value = mean + std * 2.0;
-    TensorFromVector(std::vector<float>{max_value}, ctx.device_context(),
-                     &max_tensor);
+    FillNpuTensorWithConstant<float>(&max_tensor, max_value);

     auto* out = ctx.Output<framework::Tensor>("Out");
     out->mutable_data<T>(ctx.GetPlace());
......
@@ -46,7 +46,6 @@ void NPUStream::Wait() const {
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_));
 }

 } // namespace stream
 } // namespace platform
 } // namespace paddle
@@ -26,7 +26,7 @@ from paddle.fluid import core
 paddle.enable_static()
 SEED = 2021
-NPUPlace = 5
+NPUPlace = 0


 @unittest.skipIf(not paddle.is_compiled_with_npu(),
@@ -38,7 +38,10 @@ class TestIncrement(OpTest):
         self.op_type = "increment"
         self.init_dtype()

-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(np.array([1]).astype(self.dtype)), }
+        self.inputs = {
+            'X':
+            OpTest.np_dtype_to_fluid_dtype(np.array([1]).astype(self.dtype)),
+        }

         self.attrs = {"Step": 1}
         self.outputs = {'Out': np.array([2])}
@@ -63,7 +66,10 @@ class TestIncrementFP16(OpTest):
         self.op_type = "increment"
         self.init_dtype()

-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(np.array([1]).astype(self.dtype)), }
+        self.inputs = {
+            'X':
+            OpTest.np_dtype_to_fluid_dtype(np.array([1]).astype(self.dtype)),
+        }
         self.pre_input_id = id(self.inputs['X'])

         self.attrs = {"Step": 1}
@@ -100,10 +106,7 @@ class TestIncrementInplace(unittest.TestCase):
         exe = paddle.static.Executor(place)
         exe.run(startup_prog)

-        b_value = exe.run(
-            main_prog,
-            feed={"a": a_np,},
-            fetch_list=[b])
+        b_value = exe.run(main_prog, feed={"a": a_np, }, fetch_list=[b])

         print('input a id is : {}'.format(id(a)))
         print('input b id is : {}'.format(id(b)))
......