Unverified commit d951f3af authored by H houj04, committed by GitHub

swish and pow op for xpu test=kunlun (#40654)

* swish and pow op for xpu. test=kunlun

* fix code style. test=kunlun.

* use pow_grad xdnn api. test=kunlun.
Parent f12b5260
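
For reference, the math that the new XPU kernels are expected to implement can be sketched in NumPy. This snippet is illustrative only and is not part of the commit; swish is assumed to use beta = 1, since the xdnn call below takes no beta argument.

```python
# Illustrative NumPy reference (not part of this commit) for the new kernels.
import numpy as np

def ref_pow(x, factor):
    # forward: out = x ** factor
    return np.power(x, factor)

def ref_pow_grad(x, dout, factor):
    # backward: dx = dout * factor * x ** (factor - 1)
    return dout * factor * np.power(x, factor - 1)

def ref_swish(x):
    # forward: out = x * sigmoid(x), assuming beta = 1
    return x / (1.0 + np.exp(-x))

def ref_swish_grad(x, dout):
    # backward: with s = sigmoid(x), dx = dout * (s + x * s * (1 - s))
    s = 1.0 / (1.0 + np.exp(-x))
    return dout * (s + x * s * (1.0 - s))
```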
......@@ -36,7 +36,7 @@ ENDIF()
if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220324")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220327")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
......@@ -340,29 +340,56 @@ struct XPUPowFunctor : public BaseActivationFunctor<T> {
auto pow_factor = ctx.Attr<float>("factor");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
T *factor_data = nullptr;
// allocate temp memory for factor on xpu
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&factor_data),
x->numel() * sizeof(T)),
XPU_SUCCESS, platform::errors::ResourceExhausted(
"XPU has no enough memory"));
int r = xpu::constant<T>(xpu_context, factor_data, x->numel(), pow_factor);
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s] in pow op.",
r, XPUAPIErrorMsg[r]));
r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel());
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU pow op return wrong value[%d %s].", r,
XPUAPIErrorMsg[r]));
if (xpu_context->xpu_stream != nullptr) {
xpu_wait(xpu_context->xpu_stream);
xpu::ctx_guard RAII_GUARD(xpu_context);
T *factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
PADDLE_ENFORCE_NOT_NULL(
factor_data,
platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
memory::Copy(ctx.GetPlace(), static_cast<void *>(factor_data),
platform::CPUPlace(), static_cast<void *>(&pow_factor),
sizeof(T));
// broadcast_pow(Context* ctx, const T* x, const T* y, T* z, const
// std::vector<int>& xshape, const std::vector<int>& yshape);
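// factor_data holds a single element, so passing yshape = {1} is expected to
// broadcast the scalar exponent over every element of x.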
auto x_dims = phi::vectorize<int>(x->dims());
int r = xpu::broadcast_pow(xpu_context, x_data, factor_data, y_data, x_dims,
{1});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
}
xpu_free(factor_data);
};
template <typename T>
struct XPUPowGradFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = x->data<T>();
const T *y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
// check dims: all dims should equal
auto x_dims = phi::vectorize<int>(x->dims());
auto dy_dims = phi::vectorize<int>(dOut->dims());
auto dx_dims = phi::vectorize<int>(dX->dims());
PADDLE_ENFORCE_EQ(x_dims, dy_dims, platform::errors::PreconditionNotMet(
"x_dims should match dy_dims."));
PADDLE_ENFORCE_EQ(x_dims, dx_dims, platform::errors::PreconditionNotMet(
"x_dims should match dx_dims."));
float pow_factor = ctx.Attr<float>("factor");
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
// int pow_grad(Context* ctx, const T* x, const T* dy, T* dx, int len, float
// factor);
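// For out = x ^ factor the expected gradient is dx = dy * factor * x ^ (factor - 1).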
int r = xpu::pow_grad(xpu_context, x_data, y_grad, x_grad, x->numel(),
pow_factor);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "pow_grad");
}
};
......@@ -410,6 +437,40 @@ struct XPUSoftPlusGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct XPUSwishFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
// int swish(Context* ctx, const T* x, T* y, int len);
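// swish(x) = x * sigmoid(x); the xdnn call takes no beta argument, so beta = 1 is assumed here.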
int r = xpu::swish(xpu_context, x_data, y_data, x->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish");
}
};
template <typename T>
struct XPUSwishGradFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = x->data<T>();
const T *y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
// int swish_grad(Context* ctx, const T* x, const T* dy, T* dx, int len);
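// With s = sigmoid(x), the expected gradient is dx = dy * (s + x * s * (1 - s)), assuming beta = 1.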
int r = xpu::swish_grad(xpu_context, x_data, y_grad, x_grad, dX->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish_grad");
}
};
} // namespace operators
} // namespace paddle
......@@ -436,6 +497,8 @@ REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(softplus, XPUSoftPlusFunctor,
XPUSoftPlusGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(swish, XPUSwishFunctor, XPUSwishGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(pow, XPUPowFunctor, XPUPowGradFunctor)
REGISTER_OP_XPU_KERNEL(
tanh, ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
......@@ -449,7 +512,5 @@ REGISTER_OP_XPU_KERNEL(exp,
ops::XPUActivationKernel<ops::XPUExpFunctor<float>>);
REGISTER_OP_XPU_KERNEL(log,
ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
REGISTER_OP_XPU_KERNEL(pow,
ops::XPUActivationKernel<ops::XPUPowFunctor<float>>);
#endif // PADDLE_WITH_XPU
......@@ -259,6 +259,8 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::FP16, XPUPlace())})},
{"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"pow_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace())})},
......@@ -352,6 +354,8 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::INT32, XPUPlace())})},
{"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
......
......@@ -349,18 +349,55 @@ class XPUTestPowOP(XPUOpTestWrapper):
self.op_name = 'pow'
self.use_dynamic_create_class = False
class XPUTestPow(TestActivationOPBase):
class XPUTestPowBase(TestActivationOPBase):
def set_case(self):
self.op_type = "pow"
self.dtype = self.in_type
x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
out = np.power(x, 3)
self.init_config()
out = np.power(self.x, self.factor)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'factor': 3.0, 'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)}
self.attrs = {'factor': self.factor, 'use_xpu': True}
self.outputs = {'Out': out}
def init_config(self):
self.x = np.random.uniform(-1, 2, [12]).astype(self.dtype)
self.factor = 3.0
class XPUTestPow1(XPUTestPowBase):
def init_config(self):
self.x = np.random.uniform(-1, 1, [1024, 8]).astype(self.dtype)
self.factor = 1
class XPUTestPow2(XPUTestPowBase):
def init_config(self):
self.x = np.random.uniform(-1, 1, [1024, 8]).astype(self.dtype)
self.factor = 2
class XPUTestPow3(XPUTestPowBase):
def init_config(self):
self.x = np.random.uniform(-2, 2,
[4, 512, 15, 15]).astype(self.dtype)
self.factor = 3
class XPUTestPow4(XPUTestPowBase):
def init_config(self):
self.x = np.random.uniform(-2, 2,
[4, 256, 22, 22]).astype(self.dtype)
self.factor = 4
class XPUTestPow5(XPUTestPowBase):
def init_config(self):
self.x = np.random.uniform(0, 1,
[4, 256, 22, 22]).astype(self.dtype)
self.factor = 1.2
class XPUTestPow6(XPUTestPowBase):
def init_config(self):
self.x = np.random.uniform(0, 1, [1024, 8]).astype(self.dtype)
self.factor = 3.2
support_types = get_xpu_op_support_types('pow')
for stype in support_types:
......@@ -886,19 +923,35 @@ class XPUTestSwishOP(XPUOpTestWrapper):
self.op_name = 'swish'
self.use_dynamic_create_class = False
class XPUTestSwish(TestActivationOPBase):
class XPUTestSwishBase(TestActivationOPBase):
def set_case(self):
self.op_type = "swish"
self.dtype = self.in_type
np.random.seed(1024)
x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
out = ref_swish(x)
self.init_config()
out = ref_swish(self.x)
self.inputs = {'X': x}
self.inputs = {'X': self.x}
self.outputs = {'Out': out}
self.attrs = {'use_xpu': True}
def init_config(self):
self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
class XPUTestSwish2(XPUTestSwishBase):
def init_config(self):
self.x = np.random.uniform(-2, 2, [1024, 8]).astype(self.dtype)
class XPUTestSwish3(XPUTestSwishBase):
def init_config(self):
self.x = np.random.uniform(-2, 2,
[4, 512, 15, 15]).astype(self.dtype)
class XPUTestSwish4(XPUTestSwishBase):
def init_config(self):
self.x = np.random.uniform(-2, 2,
[4, 256, 22, 22]).astype(self.dtype)
support_types = get_xpu_op_support_types('swish')
for stype in support_types:
......
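
After this change, both activations should be exercisable from the Python API on Kunlun devices. A hypothetical smoke test (not part of this commit; it assumes a PaddlePaddle build with XPU support and an available XPU device):

```python
# Hypothetical end-to-end check: run the new pow and swish kernels on XPU.
import paddle
import paddle.nn.functional as F

paddle.set_device('xpu')  # requires an XPU (Kunlun) enabled Paddle build

x = paddle.uniform([4, 256, 22, 22], min=-2.0, max=2.0)
x.stop_gradient = False

y = paddle.pow(x, 3.0)  # expected to dispatch to the XPU pow / pow_grad kernels
z = F.swish(x)          # expected to dispatch to the XPU swish / swish_grad kernels

(y.sum() + z.sum()).backward()
print(x.grad.shape)
```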