未验证 提交 64f29fbb 编写于 作者: Q QingshuChen 提交者: GitHub

update kunlun conv2d/softmax/elementwise implementation (#29229)

* update conv2d & softmax to new xpu api
* test=kunlun

* remove useless comments
* test=kunlun

* remove softmax xpu op
* test=kunlun

* update kunlun softmax
* test=kunlun

* update xpu unittest
* test=kunlun

* fix elementwise_grad bug for kunlun
* test=kunlun
上级 b11ab127
......@@ -4,7 +4,7 @@ endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_11_10.tar.gz" CACHE STRING "" FORCE)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_11_30.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
......
......@@ -27,10 +27,6 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
// that avoids modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
// Tensor* max_input = context.Output<Tensor>("MaxInput");
// Tensor* max_filter = context.Output<Tensor>("MaxFilter");
// max_input->mutable_data<T>(context.GetPlace());
// max_filter->mutable_data<T>(context.GetPlace());
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
......@@ -43,52 +39,18 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
const int f = static_cast<int>(filter.dims()[0]);
const int win_h = static_cast<int>(filter.dims()[2]);
const int win_w = static_cast<int>(filter.dims()[3]);
PADDLE_ENFORCE_EQ(
dilations[0] == 1 && dilations[1] == 1, true,
platform::errors::InvalidArgument("XPU only support dilation == 1."));
auto& dev_ctx = context.template device_context<DeviceContext>();
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), input->data<T>(), input->numel(),
// max_input->data<T>()) == xpu::Error_t::SUCCESS,
// true, platform::errors::InvalidArgument(
// "XPU conv kernel error,can not finde max_input,please "
// "check whether Baidu Kunlun "
// "Card is properly installed."));
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), filter.data<T>(), filter.numel(),
// max_filter->data<T>()) == xpu::Error_t::SUCCESS,
// true, platform::errors::InvalidArgument(
// "XPU conv kernel error,can not find max_filter,please "
// "check whether Baidu Kunlun "
// "Card is properly installed."));
if (groups == 1) {
int r = xpu::conv2d_forward_int16<float, float, float, float>(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, input->data<float>(), filter.data<float>(),
output->data<float>(), nullptr, nullptr, xpu::Activation_t::LINEAR,
nullptr, nullptr);
// max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
} else {
int r = xpu::conv2d_int16_with_group<float, float, float>(
dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
output->data<float>(), batch_size, img_c, img_h, img_w, f, win_h,
win_w, groups, strides[0], strides[1], paddings[0], paddings[1],
nullptr, nullptr);
// max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
std::vector<int> k_size;
k_size.push_back(win_h);
k_size.push_back(win_w);
int r = xpu::conv2d<float, float, float, int16_t>(
dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
output->data<float>(), batch_size, img_c, img_h, img_w, f, k_size,
strides, paddings, dilations, groups, nullptr, nullptr, nullptr, true);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
};
template <typename DeviceContext, typename T>
......@@ -96,9 +58,6 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
// const Tensor* max_input = context.Input<Tensor>("MaxInput");
// const Tensor* max_filter = context.Input<Tensor>("MaxFilter");
// Tensor* max_output_grad = context.Output<Tensor>("MaxOutputGrad");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
......@@ -115,11 +74,6 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const int batch_size = static_cast<int>(input->dims()[0]);
PADDLE_ENFORCE_EQ(groups == 1, true, platform::errors::InvalidArgument(
"XPU only support groups == 1."));
PADDLE_ENFORCE_EQ(
dilations[0] == 1 && dilations[1] == 1, true,
platform::errors::InvalidArgument("XPU only support dilation == 1."));
const int img_c = static_cast<int>(input->dims()[1]);
const int img_h = static_cast<int>(input->dims()[2]);
const int img_w = static_cast<int>(input->dims()[3]);
......@@ -133,52 +87,24 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
filter_grad->mutable_data<T>(context.GetPlace());
}
auto& dev_ctx = context.template device_context<DeviceContext>();
// max_output_grad->Resize({4});
// max_output_grad->mutable_data<T>(context.GetPlace());
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), output_grad->data<T>(),
// output_grad->numel(),
// max_output_grad->data<T>()) == xpu::Error_t::SUCCESS,
// true,
// platform::errors::External(
// "XPU conv kernel error, can not find max_output_grad, please
// check "
// "whether Baidu Kunlun Card is "
// "properly installed."));
if (input_grad) {
int r = xpu::conv2d_backward_int16(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, output_grad->data<float>(),
filter.data<float>(), input_grad->data<float>(), nullptr, nullptr);
// max_output_grad->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
if (filter_grad) {
int r = xpu::conv2d_backward_weight_int16(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, output_grad->data<float>(),
input->data<float>(), filter_grad->data<float>(), nullptr, nullptr);
// max_output_grad->data<float>(), max_input->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
std::vector<int> k_size;
k_size.push_back(win_h);
k_size.push_back(win_w);
int r = xpu::conv2d_grad<float, float, float, int16_t>(
dev_ctx.x_context(), input->data<T>(), filter.data<T>(),
output_grad->data<T>(), input_grad ? input_grad->data<T>() : nullptr,
filter_grad ? filter_grad->data<T>() : nullptr, batch_size, img_c,
img_h, img_w, f, k_size, strides, paddings, dilations, groups, nullptr,
nullptr, nullptr, nullptr, nullptr, true);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_XPU_KERNEL(
depthwise_conv2d,
ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>);
......@@ -187,4 +113,7 @@ REGISTER_OP_XPU_KERNEL(
REGISTER_OP_XPU_KERNEL(
conv2d_grad,
ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
depthwise_conv2d_grad,
ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
......@@ -65,7 +65,7 @@ static std::pair<std::vector<int>, std::vector<int>> XPUReducesAxisVector(
}
int yidx = 0;
for (size_t i = 0; i < x_vector.size(); ++i) {
if (y[yidx] == 1) {
if (yidx >= y.size() || y[yidx] == 1) {
axis_v.push_back(i);
yidx++;
continue;
......@@ -134,10 +134,10 @@ void XPUElementwise(
std::pair<std::vector<int>, std::vector<int>> bcast_v =
XPUDimsToBroadcastVector(framework::make_ddim(x_dims_array), out_dim);
ret = xpu::broadcast<T>(
dev_ctx.x_context(), x_data,
x_broadcast_tensor.mutable_data<T>(ctx.GetPlace(), z->numel()),
bcast_v.first, bcast_v.second);
ret = xpu::broadcast<T>(dev_ctx.x_context(), x_data,
x_broadcast_tensor.mutable_data<T>(
ctx.GetPlace(), z->numel() * sizeof(T)),
bcast_v.first, bcast_v.second);
PADDLE_ENFORCE_EQ(
ret, xpu::SUCCESS,
platform::errors::External(
......@@ -153,10 +153,10 @@ void XPUElementwise(
std::vector<int> bcast_y_v;
std::pair<std::vector<int>, std::vector<int>> bcast_v =
XPUDimsToBroadcastVector(framework::make_ddim(y_dims_array), out_dim);
ret = xpu::broadcast<T>(
dev_ctx.x_context(), y_data,
y_broadcast_tensor.mutable_data<T>(ctx.GetPlace(), z->numel()),
bcast_v.first, bcast_v.second);
ret = xpu::broadcast<T>(dev_ctx.x_context(), y_data,
y_broadcast_tensor.mutable_data<T>(
ctx.GetPlace(), z->numel() * sizeof(T)),
bcast_v.first, bcast_v.second);
PADDLE_ENFORCE_EQ(
ret, xpu::SUCCESS,
platform::errors::External(
......@@ -231,13 +231,15 @@ void XPUElementwiseGrad(const framework::ExecutionContext& ctx,
bool dx_need_reduce = (dx != nullptr) && (dx->numel() != len);
bool dy_need_reduce = (dy != nullptr) && (dy->numel() != len);
T* dx_data = ((dx == nullptr) || dx_need_reduce)
? (dx_local_tensor.mutable_data<T>(ctx.GetPlace(), len))
: (dx->mutable_data<T>(ctx.GetPlace()));
T* dx_data =
((dx == nullptr) || dx_need_reduce)
? (dx_local_tensor.mutable_data<T>(ctx.GetPlace(), len * sizeof(T)))
: (dx->mutable_data<T>(ctx.GetPlace()));
T* dy_data = ((dy == nullptr) || dy_need_reduce)
? (dy_local_tensor.mutable_data<T>(ctx.GetPlace(), len))
: (dy->mutable_data<T>(ctx.GetPlace()));
T* dy_data =
((dy == nullptr) || dy_need_reduce)
? (dy_local_tensor.mutable_data<T>(ctx.GetPlace(), len * sizeof(T)))
: (dy->mutable_data<T>(ctx.GetPlace()));
int ret = xpu::SUCCESS;
auto& dev_ctx =
......@@ -250,8 +252,8 @@ void XPUElementwiseGrad(const framework::ExecutionContext& ctx,
XPUDimsToBroadcastVector(framework::make_ddim(x_dims_array), out_dim);
ret = xpu::broadcast<T>(
dev_ctx.x_context(), x_data,
x_broadcast_tensor.mutable_data<T>(ctx.GetPlace(), len), bcast_v.first,
bcast_v.second);
x_broadcast_tensor.mutable_data<T>(ctx.GetPlace(), len * sizeof(T)),
bcast_v.first, bcast_v.second);
PADDLE_ENFORCE_EQ(ret, xpu::SUCCESS,
platform::errors::External(
"XPU kernel broadcast error occur! %d", ret));
......@@ -267,8 +269,8 @@ void XPUElementwiseGrad(const framework::ExecutionContext& ctx,
XPUDimsToBroadcastVector(framework::make_ddim(y_dims_array), out_dim);
ret = xpu::broadcast<T>(
dev_ctx.x_context(), y_data,
y_broadcast_tensor.mutable_data<T>(ctx.GetPlace(), len), bcast_v.first,
bcast_v.second);
y_broadcast_tensor.mutable_data<T>(ctx.GetPlace(), len * sizeof(T)),
bcast_v.first, bcast_v.second);
PADDLE_ENFORCE_EQ(ret, xpu::SUCCESS,
platform::errors::External(
"XPU kernel broadcast error occur! %d", ret));
......@@ -287,9 +289,9 @@ void XPUElementwiseGrad(const framework::ExecutionContext& ctx,
const framework::DDim& dx_dims = dx->dims();
std::pair<std::vector<int>, std::vector<int>> reduce_v =
XPUReducesAxisVector(out_dim, dx_dims);
ret = xpu::reduce_sum(dev_ctx.x_context(), dx_data,
dx->mutable_data<T>(ctx.GetPlace()), reduce_v.first,
reduce_v.second);
ret = xpu::reduce_sum<T>(dev_ctx.x_context(), dx_data,
dx->mutable_data<T>(ctx.GetPlace()),
reduce_v.first, reduce_v.second);
PADDLE_ENFORCE_EQ(
ret, xpu::SUCCESS,
platform::errors::External("XPU kernel reduce_sum occur error in "
......@@ -302,9 +304,9 @@ void XPUElementwiseGrad(const framework::ExecutionContext& ctx,
const framework::DDim& dy_dims = dy->dims();
std::pair<std::vector<int>, std::vector<int>> reduce_v =
XPUReducesAxisVector(out_dim, dy_dims);
ret = xpu::reduce_sum(dev_ctx.x_context(), dy_data,
dy->mutable_data<T>(ctx.GetPlace()), reduce_v.first,
reduce_v.second);
ret = xpu::reduce_sum<T>(dev_ctx.x_context(), dy_data,
dy->mutable_data<T>(ctx.GetPlace()),
reduce_v.first, reduce_v.second);
PADDLE_ENFORCE_EQ(
ret, xpu::SUCCESS,
platform::errors::External("XPU kernel reduce_sum occur error in "
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -30,29 +27,27 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
const int rank = x->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
PADDLE_ENFORCE_EQ(axis == -1 || axis == rank - 1, true,
platform::errors::InvalidArgument(
"xpu softmax kernel only support last dimension of x "
"(axis==-1 or axis==x_dims-1), but received axis: "
"%d, x's shape: %s.",
axis, x->dims()));
int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
// allocate memory on device.
out->mutable_data<T>(context.GetPlace());
const int n = SizeToAxis(axis, x->dims());
const int d = SizeFromAxis(axis, x->dims());
std::vector<int> x_dims;
for (int i = 0; i < rank; i++) {
x_dims.push_back(x->dims()[i]);
}
if (axis < 0) {
axis += rank;
}
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = xpu::softmax2d_forward(dev_ctx.x_context(), x->data<float>(),
out->data<float>(), n, d, d <= 2048);
int r = xpu::softmax<T>(dev_ctx.x_context(), x->data<float>(),
out->data<float>(), x_dims, axis);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API(softmax2d_forward) return wrong "
"value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
};
......@@ -64,24 +59,28 @@ class SoftmaxGradXPUKernel : public framework::OpKernel<T> {
auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
const int rank = dx->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
// allocate memory on device.
dx->mutable_data<T>(context.GetPlace());
const int n = SizeToAxis(axis, dx->dims());
const int d = SizeFromAxis(axis, dx->dims());
std::vector<int> x_dims;
for (int i = 0; i < rank; i++) {
x_dims.push_back(dx->dims()[i]);
}
if (axis < 0) {
axis += rank;
}
auto& dev_ctx = context.template device_context<DeviceContext>();
int r =
xpu::softmax2d_backward(dev_ctx.x_context(), out->data<float>(),
dout->data<float>(), dx->data<float>(), n, d);
int r = xpu::softmax_grad<T>(dev_ctx.x_context(), out->data<float>(),
dout->data<float>(), dx->data<float>(), x_dims,
axis);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API(softmax2d_backward) return wrong "
"value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
};
......
......@@ -15,6 +15,7 @@
#pragma once
#ifdef PADDLE_WITH_XPU
#include <map>
#include <string>
#include <unordered_map>
......@@ -48,4 +49,11 @@ class XPUActHelper {
return res->second;
}
};
// Lookup table from an XPU API return code (xpu::Error_t) to a short
// human-readable description. The XPU kernels in this change index it as
// XPUAPIErrorMsg[r] to build the text of their PADDLE_ENFORCE_EQ error
// messages.
//
// NOTE: std::map::operator[] default-inserts a missing key, so looking up a
// return code that is not listed here yields an empty string and adds that
// entry to the map; the table therefore has to stay non-const for those
// call sites.
// NOTE(review): this appears to live in a header (`#pragma once` above), in
// which case `static` gives every translation unit its own copy - confirm
// that is intended.
static std::map<int, std::string> XPUAPIErrorMsg = {
{xpu::Error_t::SUCCESS, "xpu api success"},
{xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
{xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
{xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
#endif
......@@ -1915,6 +1915,10 @@ def load(program, model_path, executor=None, var_list=None):
place = paddle.fluid.CPUPlace()
elif p.is_cuda_pinned_place():
place = paddle.fluid.CUDAPinnedPlace()
elif p.is_xpu_place():
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.XPUPlace(p.xpu_device_id())
else:
p = paddle.fluid.core.Place()
p.set_place(t._place())
......
......@@ -362,17 +362,6 @@ class XPUOpTest(OpTest):
if not type(output_names) is list:
output_names = [output_names]
numeric_grads = user_defined_grads or [
get_numeric_gradient(
place,
self.scope,
self.op,
self.inputs,
input_to_check,
output_names,
delta=numeric_grad_delta,
in_place=in_place) for input_to_check in inputs_to_check
]
analytic_grads = self._get_gradient(inputs_to_check, place,
output_names, no_grad_set)
return analytic_grads
......@@ -13,6 +13,9 @@
# limitations under the License.
from __future__ import print_function
import sys
sys.path.append("..")
from test_softmax_op import stable_softmax
from op_test import OpTest
import paddle.fluid.core as core
......@@ -20,8 +23,6 @@ import paddle
import unittest
import numpy as np
import sys
sys.path.append("..")
def cross_entropy(softmax, label, soft_label, axis, ignore_index=-1):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册