diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index dbc629a3e3fd2f3b5d99bba78954cd00986310af..0d340ab638b1a53de5eef690fb0349c03f9b4e30 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220324")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220327")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc
index 62fb98b63a837d27e034058b32993db9ae583d5e..4c2d3fc162f832e9dafbc39784a233f23a6886be 100644
--- a/paddle/fluid/operators/activation_op_xpu.cc
+++ b/paddle/fluid/operators/activation_op_xpu.cc
@@ -340,29 +340,56 @@ struct XPUPowFunctor : public BaseActivationFunctor<T> {
     auto pow_factor = ctx.Attr<float>("factor");
     const T *x_data = x->data<T>();
     T *y_data = y->mutable_data<T>(ctx.GetPlace());
-    T *factor_data = nullptr;
+    // allocate temp memory for factor on xpu
     auto xpu_context =
         ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
-    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&factor_data),
-                                 x->numel() * sizeof(T)),
-                      XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                       "XPU has no enough memory"));
-    int r = xpu::constant<T>(xpu_context, factor_data, x->numel(), pow_factor);
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External("XPU constant op return"
-                                   " wrong value[%d %s] in pow op.",
-                                   r, XPUAPIErrorMsg[r]));
-    r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel());
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External("XPU pow op return wrong value[%d %s].", r,
-                                   XPUAPIErrorMsg[r]));
-    if (xpu_context->xpu_stream != nullptr) {
-      xpu_wait(xpu_context->xpu_stream);
-    }
-    xpu_free(factor_data);
+    xpu::ctx_guard RAII_GUARD(xpu_context);
+    T *factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
+    PADDLE_ENFORCE_NOT_NULL(
+        factor_data,
+        platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
+    memory::Copy(ctx.GetPlace(), static_cast<void *>(factor_data),
+                 platform::CPUPlace(), static_cast<void *>(&pow_factor),
+                 sizeof(T));
+
+    // broadcast_pow(Context* ctx, const T* x, const T* y, T* z, const
+    // std::vector<int>& xshape, const std::vector<int>& yshape);
+    auto x_dims = phi::vectorize<int>(x->dims());
+    int r = xpu::broadcast_pow(xpu_context, x_data, factor_data, y_data,
+                               x_dims, {1});
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
+  }
+};
+
+template <typename T>
+struct XPUPowGradFunctor : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    const auto *x = ctx.Input<Tensor>("X");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    const T *x_data = x->data<T>();
+    const T *y_grad = dOut->data<T>();
+    T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
+
+    // check dims: all dims should be equal
+    auto x_dims = phi::vectorize<int>(x->dims());
+    auto dy_dims = phi::vectorize<int>(dOut->dims());
+    auto dx_dims = phi::vectorize<int>(dX->dims());
+    PADDLE_ENFORCE_EQ(x_dims, dy_dims, platform::errors::PreconditionNotMet(
+                                           "x_dims should match dy_dims."));
+    PADDLE_ENFORCE_EQ(x_dims, dx_dims, platform::errors::PreconditionNotMet(
+                                           "x_dims should match dx_dims."));
+    float pow_factor = ctx.Attr<float>("factor");
+
+    auto xpu_context =
+        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
+    // int pow_grad(Context* ctx, const T* x, const T* dy, T* dx, int len,
+    // float factor);
+    int r = xpu::pow_grad(xpu_context, x_data, y_grad, x_grad, x->numel(),
+                          pow_factor);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "pow_grad");
   }
 };
 
@@ -410,6 +437,40 @@ struct XPUSoftPlusGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct XPUSwishFunctor : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    const auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Output<Tensor>("Out");
+    const T *x_data = x->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+
+    auto xpu_context =
+        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
+    // int swish(Context* ctx, const T* x, T* y, int len);
+    int r = xpu::swish(xpu_context, x_data, y_data, x->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish");
+  }
+};
+
+template <typename T>
+struct XPUSwishGradFunctor : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    const auto *x = ctx.Input<Tensor>("X");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    const T *x_data = x->data<T>();
+    const T *y_grad = dOut->data<T>();
+    T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
+
+    auto xpu_context =
+        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
+    // int swish_grad(Context* ctx, const T* x, const T* dy, T* dx, int len);
+    int r = xpu::swish_grad(xpu_context, x_data, y_grad, x_grad, dX->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish_grad");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -436,6 +497,8 @@ XPUOpMap& get_kl2_ops() {
 REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(softplus, XPUSoftPlusFunctor,
                                XPUSoftPlusGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(swish, XPUSwishFunctor, XPUSwishGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(pow, XPUPowFunctor, XPUPowGradFunctor)
 REGISTER_OP_XPU_KERNEL(
     tanh, ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
@@ -449,7 +512,5 @@ REGISTER_OP_XPU_KERNEL(exp,
                        ops::XPUActivationKernel<ops::XPUExpFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(log,
                        ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
-REGISTER_OP_XPU_KERNEL(pow,
-                       ops::XPUActivationKernel<ops::XPUPowFunctor<float>>);
 
 #endif  // PADDLE_WITH_XPU
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 3feb33e4ac9b120801e3608fa8f629073b400bcc..194920db9cca9df903b83bb13e546f17915c566d 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -259,6 +259,8 @@ XPUOpMap& get_kl2_ops() {
                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                                pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"pow_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
@@ -352,6 +354,8 @@ XPUOpMap& get_kl2_ops() {
                     pOpKernelType(vartype::INT32, XPUPlace())})},
       {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                             pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                                   pOpKernelType(vartype::FP16, XPUPlace())})},
       {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
index 66f2e871dac462c8e6e47357e7367755d2fc0cfc..9e2825ab631f0731d0199cc366ba243b615bc7ac 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
@@ -349,18 +349,55 @@ class XPUTestPowOP(XPUOpTestWrapper):
         self.op_name = 'pow'
         self.use_dynamic_create_class = False
 
-    class XPUTestPow(TestActivationOPBase):
+    class XPUTestPowBase(TestActivationOPBase):
         def set_case(self):
             self.op_type = "pow"
             self.dtype = self.in_type
 
-            x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
-            out = np.power(x, 3)
+            self.init_config()
+            out = np.power(self.x, self.factor)
 
-            self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-            self.attrs = {'factor': 3.0, 'use_xpu': True}
+            self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)}
+            self.attrs = {'factor': self.factor, 'use_xpu': True}
             self.outputs = {'Out': out}
 
+        def init_config(self):
+            self.x = np.random.uniform(-1, 2, [12]).astype(self.dtype)
+            self.factor = 3.0
+
+    class XPUTestPow1(XPUTestPowBase):
+        def init_config(self):
+            self.x = np.random.uniform(-1, 1, [1024, 8]).astype(self.dtype)
+            self.factor = 1
+
+    class XPUTestPow2(XPUTestPowBase):
+        def init_config(self):
+            self.x = np.random.uniform(-1, 1, [1024, 8]).astype(self.dtype)
+            self.factor = 2
+
+    class XPUTestPow3(XPUTestPowBase):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2,
+                                       [4, 512, 15, 15]).astype(self.dtype)
+            self.factor = 3
+
+    class XPUTestPow4(XPUTestPowBase):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2,
+                                       [4, 256, 22, 22]).astype(self.dtype)
+            self.factor = 4
+
+    class XPUTestPow5(XPUTestPowBase):
+        def init_config(self):
+            self.x = np.random.uniform(0, 1,
+                                       [4, 256, 22, 22]).astype(self.dtype)
+            self.factor = 1.2
+
+    class XPUTestPow6(XPUTestPowBase):
+        def init_config(self):
+            self.x = np.random.uniform(0, 1, [1024, 8]).astype(self.dtype)
+            self.factor = 3.2
+
 
 support_types = get_xpu_op_support_types('pow')
 for stype in support_types:
@@ -886,19 +923,35 @@ class XPUTestSwishOP(XPUOpTestWrapper):
         self.op_name = 'swish'
         self.use_dynamic_create_class = False
 
-    class XPUTestSwish(TestActivationOPBase):
+    class XPUTestSwishBase(TestActivationOPBase):
         def set_case(self):
             self.op_type = "swish"
             self.dtype = self.in_type
 
-            np.random.seed(1024)
-            x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
-            out = ref_swish(x)
+            self.init_config()
+            out = ref_swish(self.x)
 
-            self.inputs = {'X': x}
+            self.inputs = {'X': self.x}
             self.outputs = {'Out': out}
             self.attrs = {'use_xpu': True}
 
+        def init_config(self):
+            self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+
+    class XPUTestSwish2(XPUTestSwishBase):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2, [1024, 8]).astype(self.dtype)
+
+    class XPUTestSwish3(XPUTestSwishBase):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2,
+                                       [4, 512, 15, 15]).astype(self.dtype)
+
+    class XPUTestSwish4(XPUTestSwishBase):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2,
+                                       [4, 256, 22, 22]).astype(self.dtype)
+
 
 support_types = get_xpu_op_support_types('swish')
 for stype in support_types:
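Note on the pow change above: the new forward path stages the scalar `factor` in a 1-element device buffer (`memory::Copy` into `factor_data`) and calls XDNN's `broadcast_pow` with `yshape = {1}`, which is numerically just an elementwise `x ** factor`; the backward path delegates to `xpu::pow_grad`, which the diff documents only by its signature. A minimal NumPy sketch of what the two kernels are expected to compute, matching `out = np.power(self.x, self.factor)` in `XPUTestPowBase.set_case`; `ref_pow` and `ref_pow_grad` are hypothetical helpers for illustration, not part of the patch, and the gradient shown is the standard derivative of `y = x**factor`:

```python
import numpy as np

def ref_pow(x, factor):
    # forward: broadcasting x against a 1-element factor tensor
    # (yshape = {1}) is the same as elementwise x ** factor
    return np.power(x, np.full([1], factor, dtype=x.dtype))

def ref_pow_grad(x, dy, factor):
    # backward (assumed semantics of xpu::pow_grad):
    # dx = dy * factor * x**(factor - 1)
    return dy * factor * np.power(x, factor - 1)
```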
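The swish tests call a `ref_swish` helper defined earlier in test_activation_op_xpu.py, outside this hunk. For reference, Paddle's swish op with its default `beta = 1` computes `x * sigmoid(x)`; a sketch consistent with that usage (the actual helper may differ in detail):

```python
import numpy as np
from scipy.special import expit  # numerically stable sigmoid

def ref_swish(x):
    # swish with the default beta = 1: x * sigmoid(x)
    return x * expit(x)
```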