Unverified Commit a34f06a1 authored by HappyAngel, committed by GitHub

[OpenCL] Add leakyRelu and tanh ops (#3048)

* fix leakyrelu and tanh compute error, test=develop

* delete extra file, test=develop

* reset act

* fix conflict and readme, test=develop

* fix ios run error, test=develop
Parent 331aaecd
@@ -16,7 +16,9 @@ limitations under the License. */
__kernel void relu(__read_only image2d_t input,
__write_only image2d_t output) {
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
@@ -33,7 +35,8 @@ __kernel void relu(__read_only image2d_t input,
__kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold){
__private const float threshold,
__private const float scale){
const int x = get_global_id(0);
const int y = get_global_id(1);
@@ -50,7 +53,9 @@ __kernel void relu6(__read_only image2d_t input,
__kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output) {
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
@@ -63,3 +68,48 @@ __kernel void sigmoid(__read_only image2d_t input,
CL_DTYPE4 out = 1 / (1 + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
__kernel void leaky_relu(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0);
const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
if (in.x < 0.0f) {
  in.x = s_val.x;
}
if (in.y < 0.0f) {
  in.y = s_val.y;
}
if (in.z < 0.0f) {
  in.z = s_val.z;
}
if (in.w < 0.0f) {
  in.w = s_val.w;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
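For reference, the leaky_relu kernel above keeps positive lanes unchanged and multiplies negative lanes by `scale` (fed from the op's `Leaky_relu_alpha` attribute). A minimal host-side C++ sketch of the same element-wise rule, useful for checking the kernel's output against a float reference (the function name is hypothetical, not part of this patch):

```cpp
#include <cstddef>

// Hypothetical scalar reference for the leaky_relu kernel:
//   out[i] = x[i]          if x[i] >= 0
//   out[i] = scale * x[i]  if x[i] <  0
void leaky_relu_ref(const float* x, float* out, size_t n, float scale) {
  for (size_t i = 0; i < n; ++i) {
    out[i] = x[i] < 0.f ? scale * x[i] : x[i];
  }
}
```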
__kernel void tanhAct(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
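One caveat with the tanhAct kernel: forming tanh as (exp(x) - exp(-x)) / (exp(x) + exp(-x)) overflows for moderately large |x| (in FP16, exp(x) already overflows near x ≈ 11), whereas the built-in `tanh` is numerically safe for all inputs. A hedged host-side reference for validating outputs (the function name is an assumption):

```cpp
#include <cmath>
#include <cstddef>

// Hypothetical reference for the tanhAct kernel. std::tanh is stable
// for all inputs, unlike the explicit exp-based formulation above.
void tanh_ref(const float* x, float* out, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    out[i] = std::tanh(x[i]);
  }
}
```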
@@ -161,4 +161,4 @@ __kernel void concat_mul(__read_only image2d_t input,
output_pos.x = in_c * out_W + (in_w + C_0);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}
}
\ No newline at end of file
}
@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries: #
@@ -40,7 +40,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS
# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
......
@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries: #
# `libpaddle_api_full_bundled.a` #
@@ -39,7 +39,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS
# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
......
# Using the Image Prediction Library
1. Download the source code (https://github.com/PaddlePaddle/Paddle-Lite), enable LITE_WITH_CV=ON, and build in full_publish mode
1. Download the source code (https://github.com/PaddlePaddle/Paddle-Lite), enable LITE_WITH_CV=ON, and build in full_publish or tiny_publish mode
Example:
```shell
set BUILD_WITH_CV=ON or LITE_WITH_CV=ON
@@ -8,7 +8,7 @@ set BUILD_WITH_CV=ON or LITE_WITH_CV=ON
--arm_abi=armv8
--arm_lang=gcc
--android_stl=c++_static
full_publish
tiny_publish
```
2. Prepare and optimize the model
@@ -68,7 +68,8 @@ make
adb -s device_id push mobilenet_v1 /data/local/tmp/
adb -s device_id push test_model_cv /data/local/tmp/
adb -s device_id push test.jpg /data/local/tmp/
adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s device_id shell chmod +x /data/local/tmp/test_model_cv
adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 "
@@ -119,7 +120,8 @@ make
adb -s device_id push mobilenet_v1 /data/local/tmp/
adb -s device_id push test_img_propress /data/local/tmp/
adb -s device_id push test.jpg /data/local/tmp/
adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s device_id shell chmod +x /data/local/tmp/test_img_propress
adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 224 224 /data/local/tmp/mobilenet_v1 "
......
@@ -25,18 +25,43 @@ namespace lite {
namespace kernels {
namespace opencl {
class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
class ActivationComputeImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Relu using cl::Image2D(ImageDefault/RGBA), kFP16";
return "Activation using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
act_param_ = param_.get_mutable<param_t>();
int act_type = static_cast<int>(act_param_->active_type);
switch (act_type) {
case 1:
kernel_func_name_ = "relu";
break;
case 2:
kernel_func_name_ = "relu6";
threshold_ = act_param_->Relu_clipped_coef;
break;
case 4:
kernel_func_name_ = "leaky_relu";
scale_ = act_param_->Leaky_relu_alpha;
break;
case 5:
kernel_func_name_ = "sigmoid";
break;
case 6:
kernel_func_name_ = "tanhAct";
break;
default:
printf("This act type: %d doesn't support \n", act_type);
return;
}
context.cl_context()->AddKernel(
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
@@ -61,74 +86,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
status = kernel.setArg(++arg_idx, threshold_);
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
}
private:
std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_half -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims();  // used only for the dim-check logging below
auto threshold = param.Relu_clipped_coef;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold);
status = kernel.setArg(++arg_idx, scale_);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
@@ -139,7 +99,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "threshold:" << threshold;
VLOG(4) << "threshold:" << threshold_;
VLOG(4) << "scale:" << scale_;
VLOG(4) << "kernel func name:" << kernel_func_name_;
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
@@ -156,94 +118,60 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
}
private:
std::string kernel_func_name_{"relu6"};
std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
param_t* act_param_{nullptr};
std::string kernel_func_name_{};
float threshold_{6.f};
float scale_{1.f};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class SigmoidComputeImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Sigmoid using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_img =
param.X->data<half_t,
cl::Image2D>();  // half_t represents a half-precision float
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"],
image_shape["height"]);
const auto& y_dims = param.Out->dims();  // used only for the dim-check logging below
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
}
private:
std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_half -DSIGMOID"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// leakyRelu
REGISTER_LITE_KERNEL(
leaky_relu,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// tanh
REGISTER_LITE_KERNEL(
tanhAct,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Relu
REGISTER_LITE_KERNEL(relu,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ReluComputeImageDefault,
ImageDefault)
REGISTER_LITE_KERNEL(
relu,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
@@ -255,12 +183,13 @@ REGISTER_LITE_KERNEL(relu,
.Finalize();
// Relu6
REGISTER_LITE_KERNEL(relu6,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::Relu6ComputeImageDefault,
ImageDefault)
REGISTER_LITE_KERNEL(
relu6,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
@@ -272,12 +201,13 @@ REGISTER_LITE_KERNEL(relu6,
.Finalize();
// Sigmoid
REGISTER_LITE_KERNEL(sigmoid,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeImageDefault,
ImageDefault)
REGISTER_LITE_KERNEL(
sigmoid,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
......
@@ -103,7 +103,6 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc,
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp);
......
@@ -830,6 +830,9 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8x8_t vg = vdup_n_u8(g);
uint8x8_t vr = vdup_n_u8(r);
#ifdef __aarch64__
uint8x16_t vb1 = vdupq_n_u8(b);
uint8x16_t vg1 = vdupq_n_u8(g);
uint8x16_t vr1 = vdupq_n_u8(r);
#else
uint8_t vb_array[8] = {b, b, b, b, b, b, b, b};
uint8_t vg_array[8] = {g, g, g, g, g, g, g, g};
@@ -925,7 +928,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
[outr2] "+r"(outr2),
[outr3] "+r"(outr3),
[cnt] "+r"(cnt)
: [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr)
: [vb] "w"(vb1), [vg] "w"(vg1), [vr] "w"(vr1)
: "cc",
"memory",
"v0",
@@ -1104,6 +1107,9 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8x8_t vg = vdup_n_u8(g);
uint8x8_t vr = vdup_n_u8(r);
#ifdef __aarch64__
uint8x16_t vb1 = vdupq_n_u8(b);
uint8x16_t vg1 = vdupq_n_u8(g);
uint8x16_t vr1 = vdupq_n_u8(r);
#else
uint8_t vb_array[8] = {b, b, b, b, b, b, b, b};
uint8_t vg_array[8] = {g, g, g, g, g, g, g, g};
@@ -1199,7 +1205,7 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
[outr2] "+r"(outr2),
[outr3] "+r"(outr3),
[cnt] "+r"(cnt)
: [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr)
: [vb] "w"(vb1), [vg] "w"(vg1), [vr] "w"(vr1)
: "cc",
"memory",
"v0",
......
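The image-conversion fix above widens the broadcast constants on aarch64: the inline assembly binds `vb`/`vg`/`vr` with the "w" constraint and reads full 128-bit v-registers, so the operands must be `uint8x16_t` from `vdupq_n_u8` rather than the 64-bit `uint8x8_t` that `vdup_n_u8` produces, which matches the compute error the commit message mentions. A minimal sketch of the widened broadcast, plus a hypothetical scalar reference for one gray pixel (the fixed-point weights and the >> 7 shift are assumptions, not taken from this diff):

```cpp
#include <arm_neon.h>
#include <cstdint>

// On aarch64 the asm reads 128-bit v-registers, so broadcast with
// vdupq_n_u8 (uint8x16_t), not vdup_n_u8 (uint8x8_t, a d-register).
inline uint8x16_t broadcast_weight(uint8_t w) { return vdupq_n_u8(w); }

// Hypothetical scalar reference for one BGR -> gray pixel, assuming
// 8-bit fixed-point weights wb/wg/wr that sum to 128 (hence >> 7).
inline uint8_t bgr_to_gray(uint8_t b, uint8_t g, uint8_t r,
                           uint8_t wb, uint8_t wg, uint8_t wr) {
  return static_cast<uint8_t>((b * wb + g * wg + r * wr) >> 7);
}
```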