From 10271ddfc43e356a28fba67272c653383a8bdc32 Mon Sep 17 00:00:00 2001 From: TTerror Date: Wed, 20 Jan 2021 20:16:43 +0800 Subject: [PATCH] support reduce_max op on kunlun (#30581) * support reduce_max op on kunlun * support reduce_max op on kunlun * support reduce_max op on kunlun * support reduce_max op on kunlun --- .../operators/reduce_ops/reduce_max_op_xpu.cc | 149 ++++++++++++++++++ .../operators/reduce_ops/reduce_op_xpu.h | 100 ++++++++++++ .../operators/reduce_ops/reduce_sum_op_xpu.cc | 68 +------- .../unittests/xpu/test_reduce_max_op_xpu.py | 74 +++++++++ 4 files changed, 325 insertions(+), 66 deletions(-) create mode 100644 paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_op_xpu.h create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc new file mode 100644 index 0000000000..a4ed0c85f4 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifdef PADDLE_WITH_XPU
+#include <memory>
+#include <string>
+#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+
+// Forward kernel of reduce_max on XPU. Attribute handling lives in the shared
+// XPUReduce helper; this class only supplies xpu::reduce_max as the reduction
+// to run.
+template <typename DeviceContext, typename T>
+class ReduceMaxXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    XPUReduce<DeviceContext, T>(context, xpu::reduce_max<T>);
+  }
+};
+
+// Backward kernel of reduce_max on XPU.
+//
+// Reads X, Out and Out@GRAD and writes X@GRAD in three device steps:
+//   1. broadcast Out and Out@GRAD from the reduced shape to the shape of X;
+//   2. compare X element-wise with the broadcast Out;
+//   3. xpu::select between the broadcast gradient and a zero-filled buffer
+//      using that comparison mask (presumably: gradient where X == Out, zero
+//      elsewhere -- confirm against the xpu::select contract).
+// Only in_dtype == -1 is accepted.
+template <typename DeviceContext, typename T>
+class ReduceMaxGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Input<Tensor>("Out");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+
+    int in_dtype = context.Attr<int>("in_dtype");
+    PADDLE_ENFORCE_EQ(
+        in_dtype == -1, true,
+        platform::errors::InvalidArgument(
+            "XPU only support in_dtype == -1 in reduce_max_grad op."));
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    x_grad->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    const T* out_data = out->data<T>();
+    const T* out_grad_data = out_grad->data<T>();
+    auto* x_grad_data = x_grad->data<T>();
+    const auto& input_dim_size = x->dims().size();
+
+    // Map negative axes onto their non-negative equivalents.
+    std::vector<int> true_dims;
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) {
+        true_dims.push_back(dims[i] + input_dim_size);
+      } else {
+        true_dims.push_back(dims[i]);
+      }
+    }
+
+    // xdims is the shape of X; ydims is the same shape with every reduced
+    // axis (or every axis when reduce_all is set) collapsed to 1.
+    std::vector<int> ydims(input_dim_size);
+    std::vector<int> xdims((input_dim_size));
+    std::set<int> dims_set(true_dims.begin(), true_dims.end());
+    for (auto i = 0; i < input_dim_size; i++) {
+      xdims[i] = x->dims()[i];
+      if (dims_set.find(i) != dims_set.end() || reduce_all) {
+        ydims[i] = 1;
+      } else {
+        ydims[i] = x->dims()[i];
+      }
+    }
+
+    // Owns the device scratch buffers. PADDLE_ENFORCE_* throws on failure, so
+    // the buffers are released from a destructor to cover every exit path;
+    // freeing them only at the end of Compute leaked them on any error.
+    struct ScratchGuard {
+      explicit ScratchGuard(const DeviceContext& ctx)
+          : ctx_(ctx), count_(0), synced_(false) {}
+      ScratchGuard(const ScratchGuard&) = delete;
+      ScratchGuard& operator=(const ScratchGuard&) = delete;
+      ~ScratchGuard() {
+        if (!synced_ && count_ > 0 && ctx_.x_context()->xpu_stream) {
+          // Leaving through an exception: work already queued on the stream
+          // may still touch the buffers, so drain it first. Best effort
+          // only, because a destructor must not throw.
+          try {
+            ctx_.Wait();
+          } catch (...) {
+          }
+        }
+        for (int i = 0; i < count_; ++i) {
+          xpu_free(buffers_[i]);
+        }
+      }
+      // Allocates `bytes` of device memory and takes ownership of it.
+      void* Alloc(size_t bytes) {
+        void* ptr = nullptr;
+        PADDLE_ENFORCE_EQ(
+            xpu_malloc(&ptr, bytes), XPU_SUCCESS,
+            platform::errors::ResourceExhausted("XPU has no enough memory"));
+        buffers_[count_++] = ptr;
+        return ptr;
+      }
+      // Called once the stream has been waited on by the normal path.
+      void MarkSynced() { synced_ = true; }
+
+      const DeviceContext& ctx_;
+      void* buffers_[3];
+      int count_;
+      bool synced_;
+    };
+
+    ScratchGuard scratch(dev_ctx);
+    // bcast_out first holds the broadcast Out and is later reused as the
+    // zero-filled operand of xpu::select.
+    T* bcast_out = static_cast<T*>(scratch.Alloc(x->numel() * sizeof(T)));
+    bool* equal =
+        static_cast<bool*>(scratch.Alloc(x->numel() * sizeof(bool)));
+    T* bcast_grad = static_cast<T*>(scratch.Alloc(x->numel() * sizeof(T)));
+
+    // step 1. broadcast out and out_grad
+    int r = xpu::broadcast<T>(dev_ctx.x_context(), out_data, bcast_out, ydims,
+                              xdims);
+    PADDLE_ENFORCE_EQ(
+        r == xpu::Error_t::SUCCESS, true,
+        platform::errors::External("XPU broadcast in reduce_max_grad op return"
+                                   " wrong value[%d %s].",
+                                   r, XPUAPIErrorMsg[r]));
+    r = xpu::broadcast<T>(dev_ctx.x_context(), out_grad_data, bcast_grad,
+                          ydims, xdims);
+    PADDLE_ENFORCE_EQ(
+        r == xpu::Error_t::SUCCESS, true,
+        platform::errors::External("XPU broadcast in reduce_max_grad op return"
+                                   " wrong value[%d %s].",
+                                   r, XPUAPIErrorMsg[r]));
+    // step 2. compare the broadcast out with x
+    r = xpu::elementwise_equal<T>(dev_ctx.x_context(), x_data, bcast_out,
+                                  equal, x->numel());
+    PADDLE_ENFORCE_EQ(
+        r == xpu::Error_t::SUCCESS, true,
+        platform::errors::External("XPU elementwise_equal in reduce_max_grad "
+                                   "op return wrong value[%d %s].",
+                                   r, XPUAPIErrorMsg[r]));
+    // step 3. get x_grad
+    r = xpu::constant<T>(dev_ctx.x_context(), bcast_out, x->numel(), 0);
+    PADDLE_ENFORCE_EQ(
+        r == xpu::Error_t::SUCCESS, true,
+        platform::errors::External("XPU constant in reduce_max_grad op return"
+                                   " wrong value[%d %s].",
+                                   r, XPUAPIErrorMsg[r]));
+    r = xpu::select<T>(dev_ctx.x_context(), equal, bcast_grad, bcast_out,
+                       x_grad_data, xdims, xdims);
+    PADDLE_ENFORCE_EQ(
+        r == xpu::Error_t::SUCCESS, true,
+        platform::errors::External("XPU select in reduce_max_grad op return"
+                                   " wrong value[%d %s].",
+                                   r, XPUAPIErrorMsg[r]));
+
+    // When a stream is set, wait for it before the scratch buffers are
+    // released by the guard.
+    if (dev_ctx.x_context()->xpu_stream) {
+      dev_ctx.Wait();
+    }
+    scratch.MarkSynced();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_XPU_KERNEL(
+    reduce_max,
+    ops::ReduceMaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    reduce_max_grad,
+    ops::ReduceMaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h
new file mode 100644
index 0000000000..fa9503ec3f
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+
+// Shared forward implementation for XPU reduce kernels.
+//
+// Reads the "dim", "reduce_all" and "out_dtype" attributes, input "X" and
+// output "Out" from `context`, then either
+//   * copies X to Out when no axis needs reducing (reduce_all is false and
+//     every requested axis has extent 1, or "dim" is empty), or
+//   * calls `func` with the shape of X and the list of axes to reduce.
+// `func` is the XPU reduction to run (e.g. xpu::reduce_sum<T>). It receives
+// the XPU context, the input and output pointers, the input shape and the
+// axes to reduce, and returns a status that is compared against
+// xpu::Error_t::SUCCESS.
+// Raises Unavailable when the place is not an XPU place, InvalidArgument when
+// out_dtype != -1, and External when the XPU call reports a failure.
+template <typename DeviceContext, typename T>
+void XPUReduce(
+    const framework::ExecutionContext& context,
+    std::function<int(xpu::Context*, const T*, T*, const std::vector<int>&,
+                      const std::vector<int>&)>
+        func) {
+  PADDLE_ENFORCE_EQ(
+      platform::is_xpu_place(context.GetPlace()), true,
+      platform::errors::Unavailable("This kernel only runs on XPU."));
+  bool reduce_all = context.Attr<bool>("reduce_all");
+  auto dims = context.Attr<std::vector<int>>("dim");
+  auto* x = context.Input<Tensor>("X");
+  auto* y = context.Output<Tensor>("Out");
+  y->mutable_data<T>(context.GetPlace());
+  auto& dev_ctx = context.template device_context<DeviceContext>();
+
+  int out_dtype = context.Attr<int>("out_dtype");
+  PADDLE_ENFORCE_EQ(out_dtype == -1, true,
+                    platform::errors::InvalidArgument(
+                        "XPU only support out_dtype == -1 in reduce op."));
+
+  const auto* x_data = x->data<T>();
+  auto* y_data = y->data<T>();
+  const auto& input_dim_size = x->dims().size();
+
+  // Map negative axes onto their non-negative equivalents.
+  std::vector<int> true_dims;
+  for (size_t i = 0; i < dims.size(); ++i) {
+    if (dims[i] < 0) {
+      true_dims.push_back(dims[i] + input_dim_size);
+    } else {
+      true_dims.push_back(dims[i]);
+    }
+  }
+
+  std::vector<int> reduce_dims;
+  std::vector<int> xdims((input_dim_size));
+  for (int i = 0; i < input_dim_size; ++i) {
+    xdims[i] = x->dims()[i];
+  }
+  if (reduce_all) {
+    for (int i = 0; i < input_dim_size; ++i) {
+      reduce_dims.push_back(i);
+    }
+  } else {
+    // The set removes duplicate axes and yields them in increasing order.
+    // Axes of extent 1 are dropped: reducing them leaves the data unchanged.
+    std::set<int> dims_set(true_dims.begin(), true_dims.end());
+    for (auto i = 0; i < input_dim_size; i++) {
+      if (dims_set.find(i) != dims_set.end()) {
+        if (x->dims()[i] != 1) {
+          reduce_dims.push_back(i);
+        }
+      }
+    }
+  }
+
+  if (reduce_dims.empty()) {
+    // Nothing to reduce: Out is a plain copy of X.
+    int r = xpu::copy<T>(dev_ctx.x_context(), x_data, y_data,
+                         x->numel() * sizeof(T));
+    PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
+                      platform::errors::External("XPU copy in reduce op return "
+                                                 "wrong value[%d %s].",
+                                                 r, XPUAPIErrorMsg[r]));
+  } else {
+    int r = func(dev_ctx.x_context(), x_data, y_data, xdims, reduce_dims);
+    PADDLE_ENFORCE_EQ(
+        r == xpu::Error_t::SUCCESS, true,
+        platform::errors::External("XPU reduce op return wrong value[%d %s].",
+                                   r, XPUAPIErrorMsg[r]));
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
index f67d43194a..bf55221bd3 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
 #include "paddle/fluid/platform/xpu_header.h"
 
 namespace paddle {
@@ -25,71 +25,7 @@ template <typename DeviceContext, typename T>
 class ReduceSumXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_xpu_place(context.GetPlace()), true,
-        platform::errors::Unavailable("This kernel only runs on XPU."));
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto dims = context.Attr<std::vector<int>>("dim");
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Output<Tensor>("Out");
-    y->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    int out_dtype = context.Attr<int>("out_dtype");
-    PADDLE_ENFORCE_EQ(
-        out_dtype == -1, true,
-        platform::errors::InvalidArgument(
-            "XPU only support out_dtype == -1 in reduce_sum op."));
-
-    const auto* x_data = x->data<T>();
-    auto* y_data = y->data<T>();
-    const auto& input_dim_size = x->dims().size();
-    std::vector<int> true_dims;
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) {
-        true_dims.push_back(dims[i] + input_dim_size);
-      } else {
-        true_dims.push_back(dims[i]);
-      }
-    }
-
-    std::vector<int> reduce_dims;
-    std::vector<int> xdims((input_dim_size));
-    for (int i = 0; i < input_dim_size; ++i) {
-      xdims[i] = x->dims()[i];
-    }
-    if (reduce_all) {
-      for (int i = 0; i < input_dim_size; ++i) {
-        reduce_dims.push_back(i);
-      }
-    } else {
-      std::set<int> dims_set(true_dims.begin(), true_dims.end());
-      for (auto i = 0; i < input_dim_size; i++) {
-        if (dims_set.find(i) != dims_set.end()) {
-          if (x->dims()[i] != 1) {
-            reduce_dims.push_back(i);
-          }
-        }
-      }
-    }
-
-    if (reduce_dims.size() == 0) {
-      int r = xpu::copy<T>(dev_ctx.x_context(), x_data, y_data,
-                           x->numel() * sizeof(T));
-      PADDLE_ENFORCE_EQ(
-          r == xpu::Error_t::SUCCESS, true,
-          platform::errors::External("XPU copy in reduce_sum op return "
-                                     "wrong value[%d %s].",
-                                     r, XPUAPIErrorMsg[r]));
-    } else {
-      int r = xpu::reduce_sum<T>(dev_ctx.x_context(), x_data, y_data, xdims,
-                                 reduce_dims);
-      PADDLE_ENFORCE_EQ(
-          r == xpu::Error_t::SUCCESS, true,
-          platform::errors::External("XPU reduce_sum in reduce_sum op return"
-                                     " wrong value[%d %s].",
-                                     r, XPUAPIErrorMsg[r]));
-    }
+    XPUReduce<DeviceContext, T>(context, xpu::reduce_sum<T>);
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
new file mode 100644
index 0000000000..55ed5442cf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test_xpu import OpTest, XPUOpTest
+from op_test import skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+
+
+# Skip explicitly when XPU support is missing, so the run reports the tests as
+# skipped instead of counting an empty test body as a pass.
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUReduceMaxOp(XPUOpTest):
+    """Checks reduce_max output and gradient on XPU against numpy's max.
+
+    Subclasses can override init_op_type / initTestCase to change the
+    attributes (keep_dim, reduce_all) or the input shape and axis.
+    """
+
+    def setUp(self):
+        self.init_op_type()
+        self.initTestCase()
+        self.use_xpu = True
+        self.use_mkldnn = False
+        self.attrs = {
+            'dim': self.axis,
+            'keep_dim': self.keep_dim,
+            'reduce_all': self.reduce_all
+        }
+        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
+        # Reference result: a full reduction ignores 'dim'; otherwise reduce
+        # along self.axis and honour keep_dim.
+        if self.attrs['reduce_all']:
+            self.outputs = {'Out': self.inputs['X'].max()}
+        else:
+            self.outputs = {
+                'Out': self.inputs['X'].max(axis=self.axis,
+                                            keepdims=self.attrs['keep_dim'])
+            }
+
+    def test_check_output(self):
+        """Forward result on XPU device 0 matches the numpy reference."""
+        paddle.enable_static()
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        """Gradient of Out with respect to X is checked on XPU device 0."""
+        paddle.enable_static()
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+    def init_op_type(self):
+        """Sets the op under test and its default attribute values."""
+        self.op_type = "reduce_max"
+        self.use_mkldnn = False
+        self.keep_dim = False
+        self.reduce_all = False
+
+    def initTestCase(self):
+        """Sets the input shape and the axes to reduce over."""
+        self.shape = (5, 6, 10)
+        self.axis = (-1, )
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab