Revert "[NPU] add npu kernel for mean Op (#31562)"

This reverts commit 468ac699.

Revert "[NPU] add npu kernel for mean Op (#31562)"
This reverts commit 468ac699.
463617d7 · Leo Chen · GitHub · 468ac699 · 463617d7 · 468ac699
4 changed files
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -184,6 +184,4 @@ endif()
 if(WITH_ASCEND_CL)
 cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor)
-cc_test(mean_op_npu_test SRCS mean_op_npu_test.cc DEPS op_registry mean_op scope device_context enforce executor)
 endif()
--- a/paddle/fluid/operators/mean_op_npu.cc
+++ b/paddle/fluid/operators/mean_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/mean_op.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/operators/npu_op_runner.h"
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class MeanNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-    auto reduce_ndim = x->dims().size();
-    std::vector<int> axes;
-    for (auto i = 0; i < reduce_ndim; ++i) {
-      axes.push_back(i);
-    }
-    framework::NPUAttributeMap attr_input = {
-                  {"keep_dims", false},
-                  {"axes", axes}};
-    std::vector<int64_t> out_dims;
-    out_dims.push_back(1);
-    out->Resize(framework::make_ddim(out_dims));
-    out->mutable_data<T>(ctx.GetPlace());
-    Tensor reduced_out(x->type());
-    std::vector<int64_t> reduced_dout_dims;
-    reduced_dout_dims.push_back(1);
-    reduced_out.Resize(framework::make_ddim(reduced_dout_dims));
-    reduced_out.mutable_data<T>(ctx.GetPlace());
-    auto runner = NpuOpRunner("ReduceMeanD",
-                              {*x},
-                              {*out},
-                              attr_input);
-    auto stream =
-      ctx.template device_context<
-                     paddle::platform::NPUDeviceContext>()
-                .stream();
-    runner.Run(stream);
-  }
-};
-template <typename DeviceContext, typename T>
-class MeanGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto stream =
-      context.template device_context<
-                          paddle::platform::NPUDeviceContext>()
-                          .stream();
-    auto grad = context.Input<Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(grad->numel(), 1,
-                      platform::errors::InvalidArgument(
-                          "Mean Gradient Input Tensor len should be 1. But "
-                          "received Out@Grad's elements num is %d.",
-                          grad->numel()));
-    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
-    IG->mutable_data<T>(context.GetPlace());
-    // ones
-    Tensor ones(grad->type());
-    std::vector<int64_t> dout_dims;
-    for (auto i = 0; i < IG->dims().size(); ++i) {
-      dout_dims.push_back(IG->dims()[i]);
-    }
-    ones.Resize(framework::make_ddim(dout_dims));
-    ones.mutable_data<T>(context.GetPlace());
-    auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
-    runner_ones.Run(stream);
-    // means
-    Tensor mean_tensor(grad->type());
-    mean_tensor.Resize({1});
-    mean_tensor.mutable_data<T>(context.GetPlace());
-    std::vector<float> mean_vec;
-    mean_vec.push_back(1.0/static_cast<float>(IG->numel()));
-    framework::TensorFromVector(mean_vec,
-                                context.device_context(),
-                                &mean_tensor);
-    // means mul ones
-    Tensor mean_ma(grad->type());
-    mean_ma.Resize(framework::make_ddim(dout_dims));
-    mean_ma.mutable_data<T>(context.GetPlace());
-    auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
-    runner_mul_1.Run(stream);
-    // and mul grad
-    auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
-    runner_mul_2.Run(stream);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    mean,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, double>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
-REGISTER_OP_NPU_KERNEL(
-    mean_grad,
-    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, double>,
-    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
--- a/paddle/fluid/operators/mean_op_npu_test.cc
+++ b/paddle/fluid/operators/mean_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/string/printf.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-namespace m = paddle::operators::math;
-USE_OP(mean);
-USE_OP_DEVICE_KERNEL(mean, NPU);
-USE_OP(mean_grad);
-USE_OP_DEVICE_KERNEL(mean_grad, NPU);
-template <typename T>
-void Compare(f::Scope* scope, const p::DeviceContext& ctx,
-             std::string op_type) {
-  // init
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-  std::vector<T> init;
-  init.push_back(static_cast<T>(1.0));
-  init.push_back(static_cast<T>(2.0));
-  init.push_back(static_cast<T>(3.0));
-  init.push_back(static_cast<T>(4.0));
-  TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({4});
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  auto op = f::OpRegistry::CreateOp(op_type,
-                           {{"X", {"X"}}},
-                           {{"Out", {"Out"}}},
-                           {});
-  op->Run(*scope, place);
-  std::vector<float> out_vec;
-  TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1);
-  EXPECT_EQ((float)out_vec[0], (float)2.5);
-}
-template <typename T>
-void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
-                 std::string op_type) {
-  // init
-  auto dout = scope->Var("DOut");
-  auto tensor_dout = dout->GetMutable<f::LoDTensor>();
-  float dvalue = 2.0;
-  tensor_dout->Resize({1});
-  std::vector<T> init_dout;
-  init_dout.push_back(static_cast<T>(dvalue));
-  TensorFromVector(init_dout, ctx, tensor_dout);
-  ctx.Wait();
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-  tensor_x->Resize({4});
-  auto dx = scope->Var("DX");
-  auto tensor_dx = dx->GetMutable<f::LoDTensor>();
-  tensor_dx->Resize({4});
-  ctx.Wait();
-  auto op = f::OpRegistry::CreateOp(op_type,
-                                    {{"Out@GRAD", {"DOut"}},
-                                     {"X", {"X"}}},
-                                    {{"X@GRAD", {"DX"}}},
-                                    {});
-  auto place = ctx.GetPlace();
-  op->Run(*scope, place);
-  std::vector<float> out_vec;
-  TensorToVector(*tensor_dx, ctx, &out_vec);
-  ctx.Wait();
-  EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
-  EXPECT_EQ((float)out_vec[0], (float)1.0/dvalue);
-  EXPECT_EQ((float)out_vec[1], (float)1.0/dvalue);
-  EXPECT_EQ((float)out_vec[2], (float)1.0/dvalue);
-  EXPECT_EQ((float)out_vec[3], (float)1.0/dvalue);
-}
-TEST(mean, NPU_fp32) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    Compare<float>(&scope, ctx, "mean");
-}
-TEST(mean_grad, NPU_fp32) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    CompareGrad<float>(&scope, ctx, "mean_grad");
-}
--- a/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import numpy as np
-import unittest
-import sys
-sys.path.append("..")
-from op_test import OpTest
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-paddle.enable_static()
-SEED = 2021
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
-class TestMean(OpTest):
-    def setUp(self):
-        self.set_npu()
-        self.place = paddle.NPUPlace(0)
-        self.op_type = "mean"
-        self.init_dtype()
-        x = np.random.random([3, 3]).astype(self.dtype)
-        self.inputs = {'X': x}
-        self.attrs = {}
-        np_out = np.mean(x)
-        self.outputs = {'Out': np_out}
-    def set_npu(self):
-        self.__class__.use_npu = True
-        self.__class__.no_need_check_grad = True
-    def init_dtype(self):
-        self.dtype = np.float32
-    def test_check_output(self):
-        self.check_output_with_place(self.place, check_dygraph=False)
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
-class TestMeanFP16(OpTest):
-    def setUp(self):
-        self.set_npu()
-        self.place = paddle.NPUPlace(0)
-        self.op_type = "mean"
-        self.init_dtype()
-        x = np.random.random([3, 3]).astype(self.dtype)
-        self.inputs = {'X': x}
-        self.attrs = {}
-        np_out = np.mean(x)
-        self.outputs = {'Out': np_out}
-    def set_npu(self):
-        self.__class__.use_npu = True
-        self.__class__.no_need_check_grad = True
-    def init_dtype(self):
-        self.dtype = np.float16
-    def test_check_output(self):
-        self.check_output_with_place(self.place, check_dygraph=False)
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
-class TestMeanNet(unittest.TestCase):
-    def _test(self, run_npu=True):
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
-        np.random.seed(SEED)
-        a_np = np.random.random(size=(32, 32)).astype('float32')
-        b_np = np.random.random(size=(32, 32)).astype('float32')
-        label_np = np.random.randint(2, size=(32, 1)).astype('int64')
-        with paddle.static.program_guard(main_prog, startup_prog):
-            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
-            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
-            c = paddle.multiply(a, b)
-            d = paddle.sqrt(c)
-            fc_1 = fluid.layers.fc(input=d, size=128)
-            prediction = fluid.layers.fc(input=fc_1, size=2, act='sigmoid')
-            cost = fluid.layers.cross_entropy(input=prediction, label=label)
-            loss = fluid.layers.mean(cost)
-            sgd = fluid.optimizer.SGD(learning_rate=0.01)
-            sgd.minimize(loss)
-        if run_npu:
-            place = paddle.NPUPlace(0)
-        else:
-            place = paddle.CPUPlace()
-        exe = paddle.static.Executor(place)
-        exe.run(startup_prog)
-        print("Start run on {}".format(place))
-        for epoch in range(100):
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
-            if epoch % 10 == 0:
-                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
-                    epoch, pred_res[0], loss_res))
-        return pred_res, loss_res
-    def test_npu(self):
-        cpu_pred, cpu_loss = self._test(False)
-        npu_pred, npu_loss = self._test(True)
-        self.assertTrue(np.allclose(npu_pred, cpu_pred))
-        self.assertTrue(np.allclose(npu_loss, cpu_loss))
-if __name__ == '__main__':
-    unittest.main()