Unverified commit cc6d2b07, authored by: Q Qi Li, committed by: GitHub

[NPU] update batch norm op, test=develop (#35223)

* [NPU] update batch norm op, test=develop

* add NHWC support for bn, test=develop
Parent d47a97db
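This commit swaps the generic "BatchNorm"/"BatchNormGrad" CANN ops for the dedicated BNInfer/BNTrainingReduce/BNTrainingUpdate (forward) and BNTrainingUpdateGrad/BNTrainingReduceGrad/BNInferGrad (backward) ops, and makes both kernels honor the NHWC data layout. A minimal sketch of how the new NHWC path would be exercised from the Python API follows; the "npu:0" device string and the presence of an NPU-enabled Paddle build are assumptions, not part of this diff.

import numpy as np
import paddle

# Assumption: Paddle was compiled with NPU support; otherwise set_device raises.
paddle.set_device("npu:0")

# NHWC input: batch=2, height=8, width=8, channels=16
x = paddle.to_tensor(np.random.randn(2, 8, 8, 16).astype("float32"))

bn = paddle.nn.BatchNorm2D(num_features=16, momentum=0.9, epsilon=1e-5,
                           data_format="NHWC")

bn.train()   # training path: BNTrainingReduce + BNTrainingUpdate on NPU
y_train = bn(x)

bn.eval()    # inference path: BNInfer on NPU
y_infer = bn(x)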
@@ -11,25 +11,30 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using NPUDeviceContext = platform::NPUDeviceContext;

template <typename T>
class NPUBatchNormOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");

    bool test_mode = is_test && (!trainable_stats);
    bool training = !test_mode && !use_global_stats;

    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    DataLayout data_layout = framework::StringToDataLayout(data_layout_str);

    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
@@ -38,48 +43,30 @@ class NPUBatchNormOpKernel : public framework::OpKernel<T> {
        "The input tensor X's dimension must equal to 4. But "
        "received X's shape = [%s], X's dimension = [%d].",
        x_dims, x_dims.size()));

    const auto *running_mean = ctx.Input<Tensor>("Mean");
    const auto *running_var = ctx.Input<Tensor>("Variance");
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");

    auto *y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());

    Tensor x_tensor(x->type());
    Tensor y_tesnor(y->type());
    x_tensor.ShareDataWith(*x);
    y_tesnor.ShareDataWith(*y);
    if (data_layout == DataLayout::kNHWC) {
      x_tensor.set_layout(DataLayout::kNHWC);
      y_tesnor.set_layout(DataLayout::kNHWC);
    }

    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
    if (!training) {
      const auto &runner_infer = NpuOpRunner(
          "BNInfer", {x_tensor, *scale, *bias, *running_mean, *running_var},
          {y_tesnor}, {{"epsilon", epsilon}});
      runner_infer.Run(stream);
    } else {
      auto *mean_out = ctx.Output<Tensor>("MeanOut");
      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
@@ -89,45 +76,30 @@ class NPUBatchNormOpKernel : public framework::OpKernel<T> {
      saved_mean->mutable_data<T>(ctx.GetPlace());
      saved_variance->mutable_data<T>(ctx.GetPlace());

      // if MomentumTensor is set, use MomentumTensor value, momentum
      // is only used in this training branch
      if (ctx.HasInput("MomentumTensor")) {
        const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
        Tensor mom_cpu;
        TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
        momentum = mom_cpu.data<float>()[0];
      }

      framework::Tensor sum, square_sum;
      sum.mutable_data<float>(running_mean->dims(), ctx.GetPlace());
      square_sum.mutable_data<float>(running_mean->dims(), ctx.GetPlace());

      const auto &runner_reduce =
          NpuOpRunner("BNTrainingReduce", {x_tensor}, {sum, square_sum},
                      {{"epsilon", epsilon}});
      runner_reduce.Run(stream);

      const auto &runner_update = NpuOpRunner(
          "BNTrainingUpdate", {x_tensor, sum, square_sum, *scale, *bias,
                               *running_mean, *running_var},
          {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance},
          {{"factor", momentum}, {"epsilon", epsilon}});
      runner_update.Run(stream);
    }
  }
};
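In the training branch above, `momentum` is forwarded to BNTrainingUpdate as the `factor` attribute so the NPU op maintains the running statistics that the previous AddMatMatElements workaround computed by hand. For reference, Paddle's convention for those running statistics (consistent with the old code's `1 - momentum` factor) is sketched below in NumPy; the helper name is illustrative, not part of the kernel.

import numpy as np

def update_running_stats(running_mean, running_var, batch_mean, batch_var,
                         momentum=0.9):
    # Paddle batch-norm convention:
    #   moving_stat = momentum * moving_stat + (1 - momentum) * batch_stat
    new_mean = momentum * running_mean + (1.0 - momentum) * batch_mean
    new_var = momentum * running_var + (1.0 - momentum) * batch_var
    return new_mean, new_var

# Example: per-channel stats for a 16-channel feature map.
rm, rv = np.zeros(16, np.float32), np.ones(16, np.float32)
bm, bv = np.full(16, 0.5, np.float32), np.full(16, 2.0, np.float32)
rm, rv = update_running_stats(rm, rv, bm, bv, momentum=0.9)
print(rm[0], rv[0])  # 0.05, 1.1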
@@ -136,85 +108,82 @@ template <typename T>
class NPUBatchNormGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    // SavedVariance have been reverted in forward operator
    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    bool use_global_stats = ctx.Attr<bool>("use_global_stats");
    const bool is_test = ctx.Attr<bool>("is_test");
    const float epsilon = ctx.Attr<float>("epsilon");
    DataLayout data_layout = framework::StringToDataLayout(data_layout_str);

    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    use_global_stats = is_test || use_global_stats;

    Tensor x_tensor(x->type());
    Tensor dy_tensor(d_y->type());
    x_tensor.ShareDataWith(*x);
    dy_tensor.ShareDataWith(*d_y);
    if (data_layout == DataLayout::kNHWC) {
      x_tensor.set_layout(DataLayout::kNHWC);
      dy_tensor.set_layout(DataLayout::kNHWC);
    }

    Tensor scale_grad_tmp(scale->type());
    Tensor bias_grad_tmp(bias->type());
    if (d_scale == nullptr) {
      scale_grad_tmp.Resize(scale->dims());
      d_scale = &scale_grad_tmp;
    }
    if (d_bias == nullptr) {
      bias_grad_tmp.Resize(bias->dims());
      d_bias = &bias_grad_tmp;
    }

    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
    if (d_scale && d_bias) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      d_bias->mutable_data<T>(ctx.GetPlace());
      if (use_global_stats) {
        const auto *running_mean = ctx.Input<Tensor>("Mean");
        const auto *running_variance = ctx.Input<Tensor>("Variance");
        const auto &runner_update =
            NpuOpRunner("BNTrainingUpdateGrad",
                        {dy_tensor, x_tensor, *running_mean, *running_variance},
                        {*d_scale, *d_bias}, {{"epsilon", epsilon}});
        runner_update.Run(stream);
      } else {
        const auto &runner_update =
            NpuOpRunner("BNTrainingUpdateGrad",
                        {dy_tensor, x_tensor, *saved_mean, *saved_inv_variance},
                        {*d_scale, *d_bias}, {{"epsilon", epsilon}});
        runner_update.Run(stream);
      }
    }
    if (d_x) {
      d_x->mutable_data<T>(ctx.GetPlace());
      Tensor dx_tensor(d_x->type());
      dx_tensor.ShareDataWith(*d_x);
      if (use_global_stats) {
        const auto *running_var = ctx.Input<Tensor>("Variance");
        const auto &runner_infer =
            NpuOpRunner("BNInferGrad", {dy_tensor, *scale, *running_var},
                        {dx_tensor}, {{"epsilon", epsilon}});
        runner_infer.Run(stream);
      } else {
        const auto &runner_reduce = NpuOpRunner(
            "BNTrainingReduceGrad", {dy_tensor, x_tensor, *d_scale, *d_bias,
                                     *scale, *saved_mean, *saved_inv_variance},
            {dx_tensor}, {{"epsilon", epsilon}});
        runner_reduce.Run(stream);
      }
    }
  }
};
......
@@ -948,7 +948,11 @@ function assert_file_diff_approvals() {

function check_coverage() {
    if [ ${WITH_COVERAGE:-ON} == "ON" ] ; then
        /bin/bash ${PADDLE_ROOT}/tools/coverage/paddle_coverage.sh
    else
        echo "WARNING: check_coverage need to compile with WITH_COVERAGE=ON, but got WITH_COVERAGE=OFF"
    fi
}
......
@@ -421,7 +421,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase):
        x = np.random.randn(*shape).astype("float32")
        y1 = compute(x, False, False)
        y2 = compute(x, True, True)
        self.assertTrue(np.allclose(y1, y2, atol=1e-5))


if __name__ == "__main__":
......