未验证 提交 a34f06a1 编写于 作者: H HappyAngel 提交者: GitHub

[OpenCL]Add leakyRelu and tanh op (#3048)

* fix leakyrelu and tanh compute error, test=develop

* delete extra file, test=develop

* reset act

* fix conflict and readme, test=develop

* fix ios run error, test=develop
上级 331aaecd
...@@ -16,7 +16,9 @@ limitations under the License. */ ...@@ -16,7 +16,9 @@ limitations under the License. */
__kernel void relu(__read_only image2d_t input, __kernel void relu(__read_only image2d_t input,
__write_only image2d_t output) { __write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
...@@ -33,7 +35,8 @@ __kernel void relu(__read_only image2d_t input, ...@@ -33,7 +35,8 @@ __kernel void relu(__read_only image2d_t input,
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold){ __private const float threshold,
__private const float scale){
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
...@@ -50,7 +53,9 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -50,7 +53,9 @@ __kernel void relu6(__read_only image2d_t input,
__kernel void sigmoid(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output) { __write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
...@@ -63,3 +68,48 @@ __kernel void sigmoid(__read_only image2d_t input, ...@@ -63,3 +68,48 @@ __kernel void sigmoid(__read_only image2d_t input,
CL_DTYPE4 out = 1 / (1 + exp(-in)); CL_DTYPE4 out = 1 / (1 + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void leaky_relu(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0);
const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
if (in.x < 0.0f){
in.x = s_val.x;
}
if (in.y < 0.0f){
in.y = s_val.y;
}
if (in.z < 0.0f){
in.z = s_val.z;
}
if (in.w < 0.0f){
in.w = s_val.w;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
__kernel void tanhAct(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
...@@ -161,4 +161,4 @@ __kernel void concat_mul(__read_only image2d_t input, ...@@ -161,4 +161,4 @@ __kernel void concat_mul(__read_only image2d_t input,
output_pos.x = in_c * out_W + (in_w + C_0); output_pos.x = in_c * out_W + (in_w + C_0);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
} }
} }
\ No newline at end of file
...@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include ...@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) #CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
############################################################### ###############################################################
# How to use one of static libaray: # # How to use one of static libaray: #
...@@ -40,7 +40,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS ...@@ -40,7 +40,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS
# 1. Comment above line using `libpaddle_light_api_shared.so` # 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a` # 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
......
...@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include ...@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) #CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
############################################################### ###############################################################
# How to use one of static libaray: # # How to use one of static libaray: #
# `libpaddle_api_full_bundled.a` # # `libpaddle_api_full_bundled.a` #
...@@ -39,7 +39,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS ...@@ -39,7 +39,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS
# 1. Comment above line using `libpaddle_light_api_shared.so` # 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a` # 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
......
# 图像预测库的使用 # 图像预测库的使用
1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish模式 1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish or tiny_publish模式
example: example:
```shell ```shell
set BUILD_WITH_CV=ON or LITE_WITH_CV=ON set BUILD_WITH_CV=ON or LITE_WITH_CV=ON
...@@ -8,7 +8,7 @@ set BUILD_WITH_CV=ON or LITE_WITH_CV=ON ...@@ -8,7 +8,7 @@ set BUILD_WITH_CV=ON or LITE_WITH_CV=ON
--arm_abi=armv8 --arm_abi=armv8
--arm_lang=gcc --arm_lang=gcc
--android_stl=c++_static --android_stl=c++_static
full_publish tiny_publish
``` ```
2. 准备模型和优化模型 2. 准备模型和优化模型
...@@ -68,7 +68,8 @@ make ...@@ -68,7 +68,8 @@ make
adb -s device_id push mobilenet_v1 /data/local/tmp/ adb -s device_id push mobilenet_v1 /data/local/tmp/
adb -s device_id push test_model_cv /data/local/tmp/ adb -s device_id push test_model_cv /data/local/tmp/
adb -s device_id push test.jpg /data/local/tmp/ adb -s device_id push test.jpg /data/local/tmp/
adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s device_id shell chmod +x /data/local/tmp/test_model_cv adb -s device_id shell chmod +x /data/local/tmp/test_model_cv
adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 " /data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 "
...@@ -119,7 +120,8 @@ make ...@@ -119,7 +120,8 @@ make
adb -s device_id push mobilenet_v1 /data/local/tmp/ adb -s device_id push mobilenet_v1 /data/local/tmp/
adb -s device_id push test_img_propress /data/local/tmp/ adb -s device_id push test_img_propress /data/local/tmp/
adb -s device_id push test.jpg /data/local/tmp/ adb -s device_id push test.jpg /data/local/tmp/
adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s device_id shell chmod +x /data/local/tmp/test_model_cv adb -s device_id shell chmod +x /data/local/tmp/test_model_cv
adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 224 224 /data/local/tmp/mobilenet_v1 " /data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 224 224 /data/local/tmp/mobilenet_v1 "
......
...@@ -25,18 +25,43 @@ namespace lite { ...@@ -25,18 +25,43 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL), class ActivationComputeImageDefault
PRECISION(kFP16), : public KernelLite<TARGET(kOpenCL),
DATALAYOUT(kImageDefault)> { PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
std::string doc() const override { std::string doc() const override {
return "Relu using cl::Image2D(ImageDefault/RGBA), kFP16"; return "Activation using cl::Image2D(ImageDefault/RGBA), kFP16";
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
act_param_ = param_.get_mutable<param_t>();
int act_type = static_cast<int>(act_param_->active_type);
switch (act_type) {
case 1:
kernel_func_name_ = "relu";
break;
case 2:
kernel_func_name_ = "relu6";
threshold_ = act_param_->Relu_clipped_coef;
break;
case 4:
kernel_func_name_ = "leaky_relu";
scale_ = act_param_->Leaky_relu_alpha;
break;
case 5:
kernel_func_name_ = "sigmoid";
break;
case 6:
kernel_func_name_ = "tanhAct";
break;
default:
printf("This act type: %d doesn't support \n", act_type);
return;
}
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/activation_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
...@@ -61,74 +86,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -61,74 +86,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img); status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold_);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_); status = kernel.setArg(++arg_idx, scale_);
}
private:
std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_half -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto threshold = param.Relu_clipped_coef;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
...@@ -139,7 +99,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -139,7 +99,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "threshold:" << threshold; VLOG(4) << "threshold:" << threshold_;
VLOG(4) << "scale:" << scale_;
VLOG(4) << "kernel func name:" << kernel_func_name_;
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]), cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
...@@ -156,94 +118,60 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -156,94 +118,60 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
} }
private: private:
std::string kernel_func_name_{"relu6"}; param_t* act_param_{nullptr};
std::string build_options_{"-DCL_DTYPE_half -DRELU6"}; std::string kernel_func_name_{};
float threshold_{6.f};
float scale_{1.f};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class SigmoidComputeImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Sigmoid using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_img =
param.X->data<half_t,
cl::Image2D>(); // use half_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t
// represents half float
image_shape["width"],
image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
}
private:
std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_half -DSIGMOID"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// leakyRelu
REGISTER_LITE_KERNEL(
leaky_relu,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// tanh
REGISTER_LITE_KERNEL(
tanhAct,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Relu // Relu
REGISTER_LITE_KERNEL(relu, REGISTER_LITE_KERNEL(
kOpenCL, relu,
kFP16, kOpenCL,
kImageDefault, kFP16,
paddle::lite::kernels::opencl::ReluComputeImageDefault, kImageDefault,
ImageDefault) paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
...@@ -255,12 +183,13 @@ REGISTER_LITE_KERNEL(relu, ...@@ -255,12 +183,13 @@ REGISTER_LITE_KERNEL(relu,
.Finalize(); .Finalize();
// Relu6 // Relu6
REGISTER_LITE_KERNEL(relu6, REGISTER_LITE_KERNEL(
kOpenCL, relu6,
kFP16, kOpenCL,
kImageDefault, kFP16,
paddle::lite::kernels::opencl::Relu6ComputeImageDefault, kImageDefault,
ImageDefault) paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
...@@ -272,12 +201,13 @@ REGISTER_LITE_KERNEL(relu6, ...@@ -272,12 +201,13 @@ REGISTER_LITE_KERNEL(relu6,
.Finalize(); .Finalize();
// Sigmoid // Sigmoid
REGISTER_LITE_KERNEL(sigmoid, REGISTER_LITE_KERNEL(
kOpenCL, sigmoid,
kFP16, kOpenCL,
kImageDefault, kFP16,
paddle::lite::kernels::opencl::SigmoidComputeImageDefault, kImageDefault,
ImageDefault) paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
......
...@@ -103,7 +103,6 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, ...@@ -103,7 +103,6 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc,
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp);
......
...@@ -830,6 +830,9 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { ...@@ -830,6 +830,9 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8x8_t vg = vdup_n_u8(g); uint8x8_t vg = vdup_n_u8(g);
uint8x8_t vr = vdup_n_u8(r); uint8x8_t vr = vdup_n_u8(r);
#ifdef __aarch64__ #ifdef __aarch64__
uint8x16_t vb1 = vdupq_n_u8(b);
uint8x16_t vg1 = vdupq_n_u8(g);
uint8x16_t vr1 = vdupq_n_u8(r);
#else #else
uint8_t vb_array[8] = {b, b, b, b, b, b, b, b}; uint8_t vb_array[8] = {b, b, b, b, b, b, b, b};
uint8_t vg_array[8] = {g, g, g, g, g, g, g, g}; uint8_t vg_array[8] = {g, g, g, g, g, g, g, g};
...@@ -925,7 +928,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { ...@@ -925,7 +928,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
[outr2] "+r"(outr2), [outr2] "+r"(outr2),
[outr3] "+r"(outr3), [outr3] "+r"(outr3),
[cnt] "+r"(cnt) [cnt] "+r"(cnt)
: [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr) : [vb] "w"(vb1), [vg] "w"(vg1), [vr] "w"(vr1)
: "cc", : "cc",
"memory", "memory",
"v0", "v0",
...@@ -1104,6 +1107,9 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { ...@@ -1104,6 +1107,9 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8x8_t vg = vdup_n_u8(g); uint8x8_t vg = vdup_n_u8(g);
uint8x8_t vr = vdup_n_u8(r); uint8x8_t vr = vdup_n_u8(r);
#ifdef __aarch64__ #ifdef __aarch64__
uint8x16_t vb1 = vdupq_n_u8(b);
uint8x16_t vg1 = vdupq_n_u8(g);
uint8x16_t vr1 = vdupq_n_u8(r);
#else #else
uint8_t vb_array[8] = {b, b, b, b, b, b, b, b}; uint8_t vb_array[8] = {b, b, b, b, b, b, b, b};
uint8_t vg_array[8] = {g, g, g, g, g, g, g, g}; uint8_t vg_array[8] = {g, g, g, g, g, g, g, g};
...@@ -1199,7 +1205,7 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { ...@@ -1199,7 +1205,7 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
[outr2] "+r"(outr2), [outr2] "+r"(outr2),
[outr3] "+r"(outr3), [outr3] "+r"(outr3),
[cnt] "+r"(cnt) [cnt] "+r"(cnt)
: [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr) : [vb] "w"(vb1), [vg] "w"(vg1), [vr] "w"(vr1)
: "cc", : "cc",
"memory", "memory",
"v0", "v0",
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册