Unverified commit 10271ddf, authored by TTerror, committed by GitHub

support reduce_max op on kunlun (#30581)

Parent 5013c676
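For orientation, an illustration added here (not code from this commit): the backward pass of reduce_max routes the incoming gradient to the positions that held the maximum. A minimal host-side C++ sketch of the same broadcast/compare/select scheme the XPU kernel below uses:

#include <algorithm>
#include <array>
#include <cstdio>

int main() {
  // x has shape [2, 3]; reduce_max over the last axis gives y of shape [2].
  std::array<std::array<float, 3>, 2> x = {{{1.f, 5.f, 2.f}, {7.f, 3.f, 7.f}}};
  std::array<float, 2> y{}, dy = {1.f, 1.f};  // dy: upstream gradient
  std::array<std::array<float, 3>, 2> dx{};
  for (int i = 0; i < 2; ++i) {
    y[i] = *std::max_element(x[i].begin(), x[i].end());
    for (int j = 0; j < 3; ++j) {
      // "Broadcast" y and dy back to x's shape, test equality, then select:
      // positions holding the maximum receive dy, everything else gets 0.
      dx[i][j] = (x[i][j] == y[i]) ? dy[i] : 0.f;
    }
  }
  for (int i = 0; i < 2; ++i) {
    std::printf("dx[%d] = {%g, %g, %g}\n", i, dx[i][0], dx[i][1], dx[i][2]);
  }
  return 0;
}

Note that ties all receive the gradient (dx[1] = {1, 0, 1} above), which matches what xpu::elementwise_equal followed by xpu::select produces in the kernel.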
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_XPU
#include <memory>
#include <string>
#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
#include "paddle/fluid/platform/xpu_header.h"
namespace paddle {
namespace operators {
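// Forward reduce_max simply dispatches to the shared XPUReduce helper in
// reduce_op_xpu.h, passing xpu::reduce_max as the reduction functor.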
template <typename DeviceContext, typename T>
class ReduceMaxXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
XPUReduce<DeviceContext, T>(context, xpu::reduce_max<T>);
}
};
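// Gradient of reduce_max: dL/dx receives dL/dy at every position where x
// equals the reduced maximum (ties all receive the gradient) and zero
// elsewhere. The kernel below realizes this by broadcasting out/out_grad
// back to x's shape, comparing the broadcast max with x, and selecting
// between out_grad and a zero-filled buffer.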
template <typename DeviceContext, typename T>
class ReduceMaxGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto dims = context.Attr<std::vector<int>>("dim");
bool reduce_all = context.Attr<bool>("reduce_all");
auto* x = context.Input<Tensor>("X");
auto* out = context.Input<Tensor>("Out");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
int in_dtype = context.Attr<int>("in_dtype");
    PADDLE_ENFORCE_EQ(
        in_dtype == -1, true,
        platform::errors::InvalidArgument(
            "XPU only supports in_dtype == -1 in reduce_max_grad op."));
auto& dev_ctx = context.template device_context<DeviceContext>();
x_grad->mutable_data<T>(context.GetPlace());
const T* x_data = x->data<T>();
const T* out_data = out->data<T>();
const T* out_grad_data = out_grad->data<T>();
auto* x_grad_data = x_grad->data<T>();
const auto& input_dim_size = x->dims().size();
std::vector<int> true_dims;
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
true_dims.push_back(dims[i] + input_dim_size);
} else {
true_dims.push_back(dims[i]);
}
}
std::vector<int> ydims(input_dim_size);
    std::vector<int> xdims(input_dim_size);
std::set<int> dims_set(true_dims.begin(), true_dims.end());
for (auto i = 0; i < input_dim_size; i++) {
xdims[i] = x->dims()[i];
if (dims_set.find(i) != dims_set.end() || reduce_all) {
ydims[i] = 1;
} else {
ydims[i] = x->dims()[i];
}
}
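    // Device-side scratch buffers: two T buffers of x's size (broadcast max
    // and broadcast gradient) plus a bool mask marking the max positions.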
    T* broadcast1 = nullptr;
    T* broadcast2 = nullptr;
    bool* equal = nullptr;
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&broadcast1),
                   x->numel() * sizeof(T)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU is out of memory"));
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&equal), x->numel() * sizeof(bool)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU is out of memory"));
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&broadcast2),
                   x->numel() * sizeof(T)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU is out of memory"));
    // step 1. broadcast out and out_grad back to the shape of x
    int r = xpu::broadcast<T>(dev_ctx.x_context(), out_data, broadcast1,
                              ydims, xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU broadcast in reduce_max_grad op "
                                   "returns wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    r = xpu::broadcast<T>(dev_ctx.x_context(), out_grad_data, broadcast2,
                          ydims, xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU broadcast in reduce_max_grad op "
                                   "returns wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    // step 2. compare the broadcast max with x to locate the max positions
    r = xpu::elementwise_equal<T>(dev_ctx.x_context(), x_data, broadcast1,
                                  equal, x->numel());
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU elementwise_equal in reduce_max_grad "
                                   "op returns wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    // step 3. get x_grad: zero-fill broadcast1, then select out_grad where
    // the mask is set and zero elsewhere
    r = xpu::constant<T>(dev_ctx.x_context(), broadcast1, x->numel(), 0);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU constant in reduce_max_grad op "
                                   "returns wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    r = xpu::select<T>(dev_ctx.x_context(), equal, broadcast2, broadcast1,
                       x_grad_data, xdims, xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU select in reduce_max_grad op "
                                   "returns wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
    xpu_free(broadcast1);
    xpu_free(broadcast2);
    xpu_free(equal);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_XPU_KERNEL(
reduce_max,
ops::ReduceMaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
reduce_max_grad,
ops::ReduceMaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/platform/xpu_header.h"
namespace paddle {
namespace operators {
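// Shared driver for the XPU reduce kernels: checks the place and out_dtype,
// normalizes negative entries of the "dim" attribute, collects the axes that
// actually need reducing, and then either copies the input through unchanged
// or invokes the supplied XDNN reduction functor.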
template <typename DeviceContext, typename T>
void XPUReduce(
const framework::ExecutionContext& context,
std::function<int(xpu::Context*, const T*, T*, const std::vector<int>&,
const std::vector<int>&)>
func) {
PADDLE_ENFORCE_EQ(
platform::is_xpu_place(context.GetPlace()), true,
platform::errors::Unavailable("This kernel only runs on XPU."));
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto* x = context.Input<Tensor>("X");
auto* y = context.Output<Tensor>("Out");
y->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
int out_dtype = context.Attr<int>("out_dtype");
  PADDLE_ENFORCE_EQ(out_dtype == -1, true,
                    platform::errors::InvalidArgument(
                        "XPU only supports out_dtype == -1 in reduce op."));
const auto* x_data = x->data<T>();
auto* y_data = y->data<T>();
const auto& input_dim_size = x->dims().size();
std::vector<int> true_dims;
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
true_dims.push_back(dims[i] + input_dim_size);
} else {
true_dims.push_back(dims[i]);
}
}
std::vector<int> reduce_dims;
  std::vector<int> xdims(input_dim_size);
for (int i = 0; i < input_dim_size; ++i) {
xdims[i] = x->dims()[i];
}
if (reduce_all) {
for (int i = 0; i < input_dim_size; ++i) {
reduce_dims.push_back(i);
}
} else {
std::set<int> dims_set(true_dims.begin(), true_dims.end());
for (auto i = 0; i < input_dim_size; i++) {
if (dims_set.find(i) != dims_set.end()) {
if (x->dims()[i] != 1) {
reduce_dims.push_back(i);
}
}
}
}
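  // Nothing is left to reduce (every requested axis has extent 1), so the
  // result is just a copy of the input.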
if (reduce_dims.size() == 0) {
int r = xpu::copy<T>(dev_ctx.x_context(), x_data, y_data,
x->numel() * sizeof(T));
    PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
                      platform::errors::External("XPU copy in reduce op "
                                                 "returns wrong value[%d %s].",
                                                 r, XPUAPIErrorMsg[r]));
} else {
int r = func(dev_ctx.x_context(), x_data, y_data, xdims, reduce_dims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU reduce op returns wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
}
}
} // namespace operators
} // namespace paddle
#endif
@@ -13,9 +13,9 @@
 // limitations under the License.

 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
 #include "paddle/fluid/platform/xpu_header.h"

 namespace paddle {
@@ -25,71 +25,7 @@ template <typename DeviceContext, typename T>
 class ReduceSumXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_xpu_place(context.GetPlace()), true,
-        platform::errors::Unavailable("This kernel only runs on XPU."));
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto dims = context.Attr<std::vector<int>>("dim");
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Output<Tensor>("Out");
-    y->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    int out_dtype = context.Attr<int>("out_dtype");
-    PADDLE_ENFORCE_EQ(
-        out_dtype == -1, true,
-        platform::errors::InvalidArgument(
-            "XPU only support out_dtype == -1 in reduce_sum op."));
-    const auto* x_data = x->data<T>();
-    auto* y_data = y->data<T>();
-    const auto& input_dim_size = x->dims().size();
-    std::vector<int> true_dims;
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) {
-        true_dims.push_back(dims[i] + input_dim_size);
-      } else {
-        true_dims.push_back(dims[i]);
-      }
-    }
-    std::vector<int> reduce_dims;
-    std::vector<int> xdims((input_dim_size));
-    for (int i = 0; i < input_dim_size; ++i) {
-      xdims[i] = x->dims()[i];
-    }
-    if (reduce_all) {
-      for (int i = 0; i < input_dim_size; ++i) {
-        reduce_dims.push_back(i);
-      }
-    } else {
-      std::set<int> dims_set(true_dims.begin(), true_dims.end());
-      for (auto i = 0; i < input_dim_size; i++) {
-        if (dims_set.find(i) != dims_set.end()) {
-          if (x->dims()[i] != 1) {
-            reduce_dims.push_back(i);
-          }
-        }
-      }
-    }
-    if (reduce_dims.size() == 0) {
-      int r = xpu::copy<T>(dev_ctx.x_context(), x_data, y_data,
-                           x->numel() * sizeof(T));
-      PADDLE_ENFORCE_EQ(
-          r == xpu::Error_t::SUCCESS, true,
-          platform::errors::External("XPU copy in reduce_sum op return "
-                                     "wrong value[%d %s].",
-                                     r, XPUAPIErrorMsg[r]));
-    } else {
-      int r = xpu::reduce_sum<T>(dev_ctx.x_context(), x_data, y_data, xdims,
-                                 reduce_dims);
-      PADDLE_ENFORCE_EQ(
-          r == xpu::Error_t::SUCCESS, true,
-          platform::errors::External("XPU reduce_sum in reduce_sum op return"
-                                     " wrong value[%d %s].",
-                                     r, XPUAPIErrorMsg[r]));
-    }
+    XPUReduce<DeviceContext, T>(context, xpu::reduce_sum<T>);
   }
 };
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test_xpu import OpTest, XPUOpTest
from op_test import skip_check_grad_ci
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
from paddle.fluid.framework import convert_np_dtype_to_dtype_
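
# Verifies the reduce_max forward result and the gradient of X on a Kunlun
# XPU device against the numpy reference computed in setUp.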
class TestXPUReduceMaxOp(XPUOpTest):
def setUp(self):
self.init_op_type()
self.initTestCase()
self.use_xpu = True
self.use_mkldnn = False
self.attrs = {
'dim': self.axis,
'keep_dim': self.keep_dim,
'reduce_all': self.reduce_all
}
self.inputs = {'X': np.random.random(self.shape).astype("float32")}
if self.attrs['reduce_all']:
self.outputs = {'Out': self.inputs['X'].max()}
else:
self.outputs = {
'Out': self.inputs['X'].max(axis=self.axis,
keepdims=self.attrs['keep_dim'])
}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
paddle.enable_static()
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad(self):
if paddle.is_compiled_with_xpu():
paddle.enable_static()
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X'], 'Out')
def init_op_type(self):
self.op_type = "reduce_max"
self.use_mkldnn = False
self.keep_dim = False
self.reduce_all = False
def initTestCase(self):
self.shape = (5, 6, 10)
self.axis = (-1, )
if __name__ == '__main__':
unittest.main()