diff --git a/docs/demo_guides/cuda.md b/docs/demo_guides/cuda.md
index 8b3e76acef590bda19a59388017added6a0b8d52..f863fd86864194c6d022e4cf1fc75eb46725cc2c 100644
--- a/docs/demo_guides/cuda.md
+++ b/docs/demo_guides/cuda.md
@@ -48,7 +48,7 @@ The CUDA build output is located in `build_cuda/inference_lite_lib`
 
 4. `demo` directory: C++ demos.
 
-If the Python option is enabled for the build, `lite_core.so` is generated under `build_cuda/inference_lite_lib/python/lib/`.
+If the Python option is enabled for the build, `lite.so` is generated under `build_cuda/inference_lite_lib/python/lib/`.
 
 ## Run
 
@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
 
 Step 2: run
 
-**NOTE:**This example uses the Python API.
+**NOTE:** This example uses the Python API.
 
 ``` python
 #-*- coding: utf-8 -*-
@@ -75,7 +75,7 @@ import sys
 import numpy as np
 import cv2
 sys.path.append('build_cuda/inference_lite_lib/python/lib')
-from lite_core import *
+from lite import *
 
 def read_img(im_path, resize_h, resize_w):
   im = cv2.imread(im_path).astype('float32')
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index dfb0a7fa68579e24eac22a7edee89a8cf9e12d5c..b08f2f5c745f87cda2be181bdea2444b2c11313c 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -181,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase {
 #endif
 #ifdef LITE_WITH_CUDA
   void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
-  int multi_stream() const { return multi_stream_; }
+  bool multi_stream() const { return multi_stream_; }
 #endif
 
 #ifdef LITE_WITH_MLU
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index e81bebe1a31656409ed718b29b956a7a66560248..8cb4dbf192993219347d70bb8ccb704199b45f3d 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -52,6 +52,7 @@ USE_MIR_PASS(mlu_postprocess_pass);
 USE_MIR_PASS(weight_quantization_preprocess_pass);
 USE_MIR_PASS(apu_subgraph_pass);
 USE_MIR_PASS(quantized_op_attributes_inference_pass);
+USE_MIR_PASS(lite_scale_activation_fuse_pass);
 USE_MIR_PASS(__xpu__resnet_fuse_pass);
 USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
 USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc
index 5aad98c05c56f85931b7a0276d0a85b426573c4c..aab1058b9dd66522a0793fc151c54707505d1fbb 100644
--- a/lite/backends/arm/math/scale.cc
+++ b/lite/backends/arm/math/scale.cc
@@ -27,31 +27,467 @@ void scale(
   int remain = num % 16;
   float32x4_t vscale = vdupq_n_f32(scale);
   float32x4_t vbias = vdupq_n_f32(bias);
+  if (cnt > 0) {
+#ifdef __aarch64__
+    asm volatile(
+        "1:                                       \n"
+        "ld1 {v4.4s}, [%[din]], #16               \n"
+        "and v8.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v5.4s}, [%[din]], #16               \n"
+        "and v9.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v6.4s}, [%[din]], #16               \n"
+        "and v10.16b, %[vbias].16b, %[vbias].16b  \n"
+        "ld1 {v7.4s}, [%[din]], #16               \n"
+        "and v11.16b, %[vbias].16b, %[vbias].16b  \n"
+
+        "fmla v8.4s, v4.4s, %[vscale].4s          \n"
+        "fmla v9.4s, v5.4s, %[vscale].4s          \n"
+        "fmla v10.4s, v6.4s, %[vscale].4s         \n"
+        "fmla v11.4s, v7.4s, %[vscale].4s         \n"
+
+        "stp q8, q9, [%[dout]], #32               \n"
+        "subs %w[cnt], %w[cnt], #1                \n"
+        "stp q10, q11, [%[dout]], #32             \n"
+
+        "bne 1b                                   \n"
+        "0:                                       \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale), [vbias] "w"(vbias)
+        : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
+#else
+    asm volatile(
+        "1:                                 @ loop header \n"
+        "vld1.32 {d8-d11}, [%[din]]!        @ load din 0 \n"
+        "vand.32 q8, %q[vbias], %q[vbias]   @ out bias \n"
+        "vand.32 q9, %q[vbias], %q[vbias]   @ out bias \n"
+        "vld1.32 {d12-d15}, [%[din]]!       @ load din 0 \n"
@ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + dout++; + din++; + } + } +} + +template <> +void scale_relu( + const float* din, float* dout, int num, float scale, float bias) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b\n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b\n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vmax.f32 q8, q8, %q[vzero] @ relu \n" + "vmax.f32 q9, q9, %q[vzero] @ relu \n" + "vmax.f32 q10, q10, %q[vzero] @ relu \n" + "vmax.f32 q11, q11, %q[vzero] @ relu \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? 
+      dout++;
+      din++;
+    }
+  }
+}
+
+template <>
+void scale_relu6(const float* din,
+                 float* dout,
+                 int num,
+                 float scale,
+                 float bias,
+                 float alpha) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  float32x4_t vscale = vdupq_n_f32(scale);
+  float32x4_t vbias = vdupq_n_f32(bias);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  float32x4_t valpha = vdupq_n_f32(alpha);
+  if (cnt > 0) {
+#ifdef __aarch64__
+    asm volatile(
+        "1:                                       \n"
+        "ld1 {v4.4s}, [%[din]], #16               \n"
+        "and v8.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v5.4s}, [%[din]], #16               \n"
+        "and v9.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v6.4s}, [%[din]], #16               \n"
+        "and v10.16b, %[vbias].16b, %[vbias].16b  \n"
+        "ld1 {v7.4s}, [%[din]], #16               \n"
+        "and v11.16b, %[vbias].16b, %[vbias].16b  \n"
+
+        "fmla v8.4s, v4.4s, %[vscale].4s          \n"
+        "fmla v9.4s, v5.4s, %[vscale].4s          \n"
+        "fmla v10.4s, v6.4s, %[vscale].4s         \n"
+        "fmla v11.4s, v7.4s, %[vscale].4s         \n"
+
+        "fmax v8.4s, v8.4s, %[vzero].4s           \n"
+        "fmax v9.4s, v9.4s, %[vzero].4s           \n"
+        "fmax v10.4s, v10.4s, %[vzero].4s         \n"
+        "fmax v11.4s, v11.4s, %[vzero].4s         \n"
+
+        "fmin v8.4s, v8.4s, %[valpha].4s          \n"
+        "fmin v9.4s, v9.4s, %[valpha].4s          \n"
+        "fmin v10.4s, v10.4s, %[valpha].4s        \n"
+        "fmin v11.4s, v11.4s, %[valpha].4s        \n"
+
+        "stp q8, q9, [%[dout]], #32               \n"
+        "subs %w[cnt], %w[cnt], #1                \n"
+        "stp q10, q11, [%[dout]], #32             \n"
+        "bne 1b                                   \n"
+        "0:                                       \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale),
+          [vbias] "w"(vbias),
+          [vzero] "w"(vzero),
+          [valpha] "w"(valpha)
+        : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
+#else
+    asm volatile(
+        "1:                                 @ loop header \n"
+        "vld1.32 {d8-d11}, [%[din]]!        @ load din 0 \n"
+        "vand.32 q8, %q[vbias], %q[vbias]   @ out bias \n"
+        "vand.32 q9, %q[vbias], %q[vbias]   @ out bias \n"
+        "vld1.32 {d12-d15}, [%[din]]!       @ load din 0 \n"
+
+        "vand.32 q10, %q[vbias], %q[vbias]  @ out bias \n"
+        "vand.32 q11, %q[vbias], %q[vbias]  @ out bias \n"
+
+        "vmla.f32 q8, q4, %q[vscale]        @ mla \n"
+        "vmla.f32 q9, q5, %q[vscale]        @ mla \n"
+        "vmla.f32 q10, q6, %q[vscale]       @ mla \n"
+        "vmla.f32 q11, q7, %q[vscale]       @ mla \n"
+
+        "vmax.f32 q8, q8, %q[vzero]         @ relu \n"
+        "vmax.f32 q9, q9, %q[vzero]         @ relu \n"
+        "vmax.f32 q10, q10, %q[vzero]       @ relu \n"
+        "vmax.f32 q11, q11, %q[vzero]       @ relu \n"
+
+        "vmin.f32 q8, q8, %q[valpha]        @ relu6 \n"
+        "vmin.f32 q9, q9, %q[valpha]        @ relu6 \n"
+        "vmin.f32 q10, q10, %q[valpha]      @ relu6 \n"
+        "vmin.f32 q11, q11, %q[valpha]      @ relu6 \n"
+
+        "vst1.32 {d16-d19}, [%[dout]]!      @ store result, add pointer \n"
+        "subs %[cnt], #1                    @ loop count minus 1 \n"
+        "vst1.32 {d20-d23}, [%[dout]]!      @ store result, add pointer \n"
+
+        "bne 1b                             @ jump to main loop start \n"
+        "2:                                 \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale),
+          [vbias] "w"(vbias),
+          [vzero] "w"(vzero),
+          [valpha] "w"(valpha)
+        : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
+#endif
+  }
+  if (remain > 0) {
+    for (int i = 0; i < remain; i++) {
+      *dout = *din * scale + bias;
+      *dout = *dout > 0.f ? (*dout < alpha ? *dout : alpha) : 0.f;
+      dout++;
+      din++;
+    }
+  }
+}
+
+template <>
+void scale_leaky_relu(const float* din,
+                      float* dout,
+                      int num,
+                      float scale,
+                      float bias,
+                      float alpha) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  float32x4_t vscale = vdupq_n_f32(scale);
+  float32x4_t vbias = vdupq_n_f32(bias);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  float32x4_t valpha = vdupq_n_f32(alpha);
+  if (cnt > 0) {
+#ifdef __aarch64__
+    asm volatile(
+        "1:                                       \n"
+        "ld1 {v4.4s}, [%[din]], #16               \n"
+        "and v8.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v5.4s}, [%[din]], #16               \n"
+        "and v9.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v6.4s}, [%[din]], #16               \n"
+        "and v10.16b, %[vbias].16b, %[vbias].16b  \n"
+        "ld1 {v7.4s}, [%[din]], #16               \n"
+        "and v11.16b, %[vbias].16b, %[vbias].16b  \n"
+
+        "fmla v8.4s, v4.4s, %[vscale].4s          \n"
+        "fmla v9.4s, v5.4s, %[vscale].4s          \n"
+        "fmla v10.4s, v6.4s, %[vscale].4s         \n"
+        "fmla v11.4s, v7.4s, %[vscale].4s         \n"
+
+        "fcmge v12.4s, v8.4s, %[vzero].4s         \n"
+        "fmul v16.4s, v8.4s, %[valpha].4s         \n"
+
+        "fcmge v13.4s, v9.4s, %[vzero].4s         \n"
+        "fmul v17.4s, v9.4s, %[valpha].4s         \n"
+
+        "fcmge v14.4s, v10.4s, %[vzero].4s        \n"
+        "fmul v18.4s, v10.4s, %[valpha].4s        \n"
+
+        "fcmge v15.4s, v11.4s, %[vzero].4s        \n"
+        "fmul v19.4s, v11.4s, %[valpha].4s        \n"
+
+        "bif v8.16b, v16.16b, v12.16b             \n" /* choose */
+        "bif v9.16b, v17.16b, v13.16b             \n" /* choose */
+        "bif v10.16b, v18.16b, v14.16b            \n" /* choose */
+        "bif v11.16b, v19.16b, v15.16b            \n" /* choose */
+
+        "stp q8, q9, [%[dout]], #32               \n"
+        "subs %w[cnt], %w[cnt], #1                \n"
+        "stp q10, q11, [%[dout]], #32             \n"
+        "bne 1b                                   \n"
+        "0:                                       \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale),
+          [vbias] "w"(vbias),
+          [vzero] "w"(vzero),
+          [valpha] "w"(valpha)
+        : "cc",
+          "memory",
+          "v4",
+          "v5",
+          "v6",
+          "v7",
+          "v8",
+          "v9",
+          "v10",
+          "v11",
+          "v12",
+          "v13",
+          "v14",
+          "v15",
+          "v16",
+          "v17",
+          "v18",
+          "v19");
+#else
+    asm volatile(
+        "1:                                 @ loop header \n"
+        "vld1.32 {d8-d11}, [%[din]]!        @ load din 0 \n"
+        "vand.32 q8, %q[vbias], %q[vbias]   @ out bias \n"
+        "vand.32 q9, %q[vbias], %q[vbias]   @ out bias \n"
+        "vld1.32 {d12-d15}, [%[din]]!       @ load din 0 \n"
+
+        "vand.32 q10, %q[vbias], %q[vbias]  @ out bias \n"
+        "vand.32 q11, %q[vbias], %q[vbias]  @ out bias \n"
+
+        "vmla.f32 q8, q4, %q[vscale]        @ mla \n"
+        "vmla.f32 q9, q5, %q[vscale]        @ mla \n"
+        "vmla.f32 q10, q6, %q[vscale]       @ mla \n"
+        "vmla.f32 q11, q7, %q[vscale]       @ mla \n"
+
+        "vcge.f32 q12, q8, %q[vzero]        @ compare \n"
+        "vmul.f32 q14, q8, %q[valpha]       @ mul \n"
+        "vcge.f32 q13, q9, %q[vzero]        @ compare \n"
+        "vmul.f32 q15, q9, %q[valpha]       @ mul \n"
+        "vbif q8, q14, q12                  @ choose \n"
+        "vbif q9, q15, q13                  @ choose \n"
+
+        "vcge.f32 q12, q10, %q[vzero]       @ compare \n"
+        "vmul.f32 q14, q10, %q[valpha]      @ mul \n"
+        "vcge.f32 q13, q11, %q[vzero]       @ compare \n"
+        "vmul.f32 q15, q11, %q[valpha]      @ mul \n"
+
+        "vst1.32 {d16-d19}, [%[dout]]!      @ store result, add pointer \n"
+
+        "vbif q10, q14, q12                 @ choose \n"
+        "vbif q11, q15, q13                 @ choose \n"
+        "subs %[cnt], #1                    @ loop count minus 1 \n"
+        "vst1.32 {d20-d23}, [%[dout]]!      @ store result, add pointer \n"
+
+        "bne 1b                             @ jump to main loop start \n"
+        "2:                                 \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale),
+          [vbias] "w"(vbias),
+          [vzero] "w"(vzero),
+          [valpha] "w"(valpha)
+        : "cc",
+          "memory",
+          "q4",
+          "q5",
+          "q6",
+          "q7",
+          "q8",
+          "q9",
+          "q10",
+          "q11",
+          "q12",
+          "q13",
+          "q14",
+          "q15");
+#endif
+  }
+  if (remain > 0) {
+    for (int i = 0; i < remain; i++) {
+      *dout = *din * scale + bias;
+      *dout = *dout > 0.f ? *dout : (*dout * alpha);
+      dout++;
+      din++;
+    }
+  }
+}
+
+template <>
+void scale(const int* din, int* dout, int num, int scale, int bias) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  int32x4_t vscale = vdupq_n_s32(scale);
+  int32x4_t vbias = vdupq_n_s32(bias);
+#pragma omp parallel for
+  for (int i = 0; i < cnt; i++) {
+    const int* din_ptr = din + (i << 4);
+    int* dout_ptr = dout + (i << 4);
+
+    int32x4_t din0 = vld1q_s32(din_ptr);
+    int32x4_t din1 = vld1q_s32(din_ptr + 4);
+    int32x4_t din2 = vld1q_s32(din_ptr + 8);
+    int32x4_t din3 = vld1q_s32(din_ptr + 12);
+
+    int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale);
+    int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale);
+    int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
+    int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
+
+    vst1q_s32(dout_ptr, vsum1);
+    vst1q_s32(dout_ptr + 4, vsum2);
+    vst1q_s32(dout_ptr + 8, vsum3);
+    vst1q_s32(dout_ptr + 12, vsum4);
+  }
+  if (remain > 0) {
+    const int* din_ptr = din + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
+    for (int i = 0; i < remain; i++) {
+      *dout_ptr = *din_ptr * scale + bias;
+      dout_ptr++;
+      din_ptr++;
+    }
+  }
+}
+
+template <>
+void scale_relu(const int* din, int* dout, int num, int scale, int bias) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  int32x4_t vscale = vdupq_n_s32(scale);
+  int32x4_t vbias = vdupq_n_s32(bias);
+  int32x4_t vzero = vdupq_n_s32(0);
 #pragma omp parallel for
   for (int i = 0; i < cnt; i++) {
-    const float* din_ptr = din + (i << 4);
-    float* dout_ptr = dout + (i << 4);
+    const int* din_ptr = din + (i << 4);
+    int* dout_ptr = dout + (i << 4);
 
-    float32x4_t din0 = vld1q_f32(din_ptr);
-    float32x4_t din1 = vld1q_f32(din_ptr + 4);
-    float32x4_t din2 = vld1q_f32(din_ptr + 8);
-    float32x4_t din3 = vld1q_f32(din_ptr + 12);
+    int32x4_t din0 = vld1q_s32(din_ptr);
+    int32x4_t din1 = vld1q_s32(din_ptr + 4);
+    int32x4_t din2 = vld1q_s32(din_ptr + 8);
+    int32x4_t din3 = vld1q_s32(din_ptr + 12);
 
-    float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
-    float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
-    float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
-    float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
+    int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale);
+    int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale);
+    int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
+    int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
 
-    vst1q_f32(dout_ptr, vsum1);
-    vst1q_f32(dout_ptr + 4, vsum2);
-    vst1q_f32(dout_ptr + 8, vsum3);
-    vst1q_f32(dout_ptr + 12, vsum4);
+    vsum1 = vmaxq_s32(vsum1, vzero);
+    vsum2 = vmaxq_s32(vsum2, vzero);
+    vsum3 = vmaxq_s32(vsum3, vzero);
+    vsum4 = vmaxq_s32(vsum4, vzero);
+
+    vst1q_s32(dout_ptr, vsum1);
+    vst1q_s32(dout_ptr + 4, vsum2);
+    vst1q_s32(dout_ptr + 8, vsum3);
+    vst1q_s32(dout_ptr + 12, vsum4);
   }
   if (remain > 0) {
-    const float* din_ptr = din + (cnt << 4);
-    float* dout_ptr = dout + (cnt << 4);
+    const int* din_ptr = din + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
     for (int i = 0; i < remain; i++) {
       *dout_ptr = *din_ptr * scale + bias;
+      *dout_ptr = *dout_ptr > 0 ? *dout_ptr : 0;
       dout_ptr++;
       din_ptr++;
     }
@@ -59,11 +495,66 @@
 }
 
 template <>
-void scale(const int* din, int* dout, int num, int scale, int bias) {
+void scale_relu6(
+    const int* din, int* dout, int num, int scale, int bias, int alpha) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  int32x4_t vscale = vdupq_n_s32(scale);
+  int32x4_t vbias = vdupq_n_s32(bias);
+  int32x4_t vzero = vdupq_n_s32(0);
+  int32x4_t valpha = vdupq_n_s32(alpha);
+#pragma omp parallel for
+  for (int i = 0; i < cnt; i++) {
+    const int* din_ptr = din + (i << 4);
+    int* dout_ptr = dout + (i << 4);
+
+    int32x4_t din0 = vld1q_s32(din_ptr);
+    int32x4_t din1 = vld1q_s32(din_ptr + 4);
+    int32x4_t din2 = vld1q_s32(din_ptr + 8);
+    int32x4_t din3 = vld1q_s32(din_ptr + 12);
+
+    int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale);
+    int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale);
+    int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
+    int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
+
+    vsum1 = vmaxq_s32(vsum1, vzero);
+    vsum2 = vmaxq_s32(vsum2, vzero);
+    vsum3 = vmaxq_s32(vsum3, vzero);
+    vsum4 = vmaxq_s32(vsum4, vzero);
+
+    vsum1 = vminq_s32(vsum1, valpha);
+    vsum2 = vminq_s32(vsum2, valpha);
+    vsum3 = vminq_s32(vsum3, valpha);
+    vsum4 = vminq_s32(vsum4, valpha);
+
+    vst1q_s32(dout_ptr, vsum1);
+    vst1q_s32(dout_ptr + 4, vsum2);
+    vst1q_s32(dout_ptr + 8, vsum3);
+    vst1q_s32(dout_ptr + 12, vsum4);
+  }
+
+  if (remain > 0) {
+    const int* din_ptr = din + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
+    for (int i = 0; i < remain; i++) {
+      *dout_ptr = *din_ptr * scale + bias;
+      *dout_ptr = *dout_ptr > 0 ? (*dout_ptr > alpha ? alpha : *dout_ptr) : 0;
+      dout_ptr++;
+      din_ptr++;
+    }
+  }
+}
+
+template <>
+void scale_leaky_relu(
+    const int* din, int* dout, int num, int scale, int bias, int alpha) {
   int cnt = num >> 4;
   int remain = num % 16;
   int32x4_t vscale = vdupq_n_s32(scale);
   int32x4_t vbias = vdupq_n_s32(bias);
+  int32x4_t vzero = vdupq_n_s32(0);
+  int32x4_t valpha = vdupq_n_s32(alpha);
 #pragma omp parallel for
   for (int i = 0; i < cnt; i++) {
     const int* din_ptr = din + (i << 4);
@@ -79,16 +570,33 @@ void scale(const int* din, int* dout, int num, int scale, int bias) {
     int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
     int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
 
+    uint32x4_t v1 = vcgeq_s32(vsum1, vzero);
+    uint32x4_t v2 = vcgeq_s32(vsum2, vzero);
+    uint32x4_t v3 = vcgeq_s32(vsum3, vzero);
+    uint32x4_t v4 = vcgeq_s32(vsum4, vzero);
+
+    int32x4_t v11 = vmulq_s32(vsum1, valpha);
+    int32x4_t v21 = vmulq_s32(vsum2, valpha);
+    int32x4_t v31 = vmulq_s32(vsum3, valpha);
+    int32x4_t v41 = vmulq_s32(vsum4, valpha);
+
+    vsum1 = vbslq_s32(v1, vsum1, v11);
+    vsum2 = vbslq_s32(v2, vsum2, v21);
+    vsum3 = vbslq_s32(v3, vsum3, v31);
+    vsum4 = vbslq_s32(v4, vsum4, v41);
+
     vst1q_s32(dout_ptr, vsum1);
     vst1q_s32(dout_ptr + 4, vsum2);
     vst1q_s32(dout_ptr + 8, vsum3);
     vst1q_s32(dout_ptr + 12, vsum4);
   }
+
   if (remain > 0) {
     const int* din_ptr = din + (cnt << 4);
     int* dout_ptr = dout + (cnt << 4);
     for (int i = 0; i < remain; i++) {
       *dout_ptr = *din_ptr * scale + bias;
+      *dout_ptr = *dout_ptr > 0 ? *dout_ptr : (*dout_ptr) * alpha;
       dout_ptr++;
       din_ptr++;
     }
diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h
index 910bea5613997c05e9257507f8f84792e0071a53..bbdb596bc8f45c247a24f9833680c8a510c1e904 100644
--- a/lite/backends/arm/math/scale.h
+++ b/lite/backends/arm/math/scale.h
@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) {
 template <typename T>
 void scale(const T* din, T* dout, int num, T scale, T bias);
 
+template <typename T>
+void scale_relu(const T* din, T* dout, int num, T scale, T bias);
+
+template <typename T>
+void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha);
+
+template <typename T>
+void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha);
+
 template <typename T>
 void scale(const T* din,
diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt
index a365fe3f7b8f04b3568fbf2c8f85af4e2469706c..b8234b18922f454c41e295209da13de024184adc 100644
--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
@@ -21,6 +21,7 @@ lite_cc_library(mir_passes
     fusion/elementwise_add_activation_fuse_pass.cc
     fusion/quant_dequant_fuse_pass.cc
     fusion/sequence_pool_concat_fuse_pass.cc
+    fusion/scale_activation_fuse_pass.cc
     fusion/__xpu__resnet_fuse_pass.cc
     fusion/__xpu__multi_encoder_fuse_pass.cc
     fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt
index 04a36976c7110c64ef781af12fc86fd4853fe583..a7a4cee798c1e8ef5b9b8f8d9e8e5810554fc571 100644
--- a/lite/core/mir/fusion/CMakeLists.txt
+++ b/lite/core/mir/fusion/CMakeLists.txt
@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate
 lite_cc_library(fuse_sequence_pool_concat
     SRCS sequence_pool_concat_fuser.cc
     DEPS pattern_matcher_high_api)
+lite_cc_library(fuse_scale_activation
+    SRCS scale_activation_fuser.cc
+    DEPS pattern_matcher_high_api)
 
 set(mir_fusers
     fuse_fc
@@ -44,6 +47,7 @@ set(mir_fusers
     fuse_transpose_softmax_transpose
     fuse_interpolate
     fuse_sequence_pool_concat
+    fuse_scale_activation
     CACHE INTERNAL "fusers")
 
 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.cc b/lite/core/mir/fusion/scale_activation_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ad1f4994f6d5183d3b5c925bb222cb95ea064e8
--- /dev/null
+++ b/lite/core/mir/fusion/scale_activation_fuse_pass.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/fusion/scale_activation_fuse_pass.h"
+#include <memory>
+#include <vector>
+#include "lite/core/mir/fusion/scale_activation_fuser.h"
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void ScaleActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  for (auto act_type : {"relu", "relu6", "leaky_relu"}) {
+    fusion::ScaleActivationFuser fuser(act_type);
+    fuser(graph.get());
+  }
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(lite_scale_activation_fuse_pass,
+                  paddle::lite::mir::ScaleActivationFusePass)
+    .BindTargets({TARGET(kARM)})
+    .BindKernel("scale");
diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.h b/lite/core/mir/fusion/scale_activation_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..2118a0b6f396ff12855009a975059c95ee6111a8
--- /dev/null
+++ b/lite/core/mir/fusion/scale_activation_fuse_pass.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class ScaleActivationFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/fusion/scale_activation_fuser.cc b/lite/core/mir/fusion/scale_activation_fuser.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f18099da8bc97d9dab8f9c31fd6c23d42d67d81
--- /dev/null
+++ b/lite/core/mir/fusion/scale_activation_fuser.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/fusion/scale_activation_fuser.h"
+#include <memory>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+void ScaleActivationFuser::BuildPattern() {
+  // create input nodes.
+  auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();
+
+  // create op nodes
+  auto* scale =
+      OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate();
+  auto* act =
+      OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();
+
+  // create intermediate nodes
+  auto* scale_out = VarNode("scale_out")
+                        ->assert_is_op_output("scale", "Out")
+                        ->assert_is_op_input(act_type_, "X")
+                        ->AsIntermediate();
+
+  // create output node
+  auto* out =
+      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
+  // create topology.
+  *x >> *scale >> *scale_out;
+  *scale_out >> *act >> *out;
+}
+
+void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
+                                         const key2nodes_t& matched) {
+  auto op_desc = GenOpDesc(matched);
+  auto scale_op = LiteOpRegistry::Global().Create("scale");
+  auto scale = matched.at("scale")->stmt()->op();
+  auto* scope = scale->scope();
+  auto& valid_places = scale->valid_places();
+  scale_op->Attach(op_desc, scope);
+
+  auto* new_op_node = graph->GraphCreateInstructNode(scale_op, valid_places);
+
+  IR_NODE_LINK_TO(matched.at("x"), new_op_node);
+  IR_NODE_LINK_TO(new_op_node, matched.at("output"));
+}
+
+cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
+  cpp::OpDesc op_desc = *matched.at("scale")->stmt()->op_info();
+  op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
+  cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info();
+
+  op_desc.SetAttr("activation_type", act_type_);
+  if (act_type_ == "relu") {
+    op_desc.SetAttr("fuse_relu", true);
+  } else if (act_type_ == "relu6") {
+    float alpha = act_op_desc.GetAttr<float>("threshold");
+    op_desc.SetAttr("alpha", alpha);
+  } else if (act_type_ == "leaky_relu") {
+    float alpha = act_op_desc.GetAttr<float>("alpha");
+    op_desc.SetAttr("alpha", alpha);
+  }
+  return op_desc;
+}
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/fusion/scale_activation_fuser.h b/lite/core/mir/fusion/scale_activation_fuser.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fa9b9d2b5ebc5091b41a2ca244689797c97ccb6
--- /dev/null
+++ b/lite/core/mir/fusion/scale_activation_fuser.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+class ScaleActivationFuser : public FuseBase {
+ public:
+  explicit ScaleActivationFuser(const std::string& act_type) {
+    act_type_ = act_type;
+  }
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+  std::string act_type_;
+};
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 9a42222e10b930ea805f4d6c048df6eba5509c86..35eba0e5ccc556369e81acfc0b6934726997d9b6 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -86,6 +86,7 @@ class Optimizer {
          "identity_scale_eliminate_pass",            //
          "elementwise_mul_constant_eliminate_pass",  //
          "lite_sequence_pool_concat_fuse_pass",      //
+         "lite_scale_activation_fuse_pass",          //
 #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
     (defined LITE_WITH_ARM)
          "lite_elementwise_add_activation_fuse_pass",  //
diff --git a/lite/demo/cxx/cuda_demo/CMakeLists.txt b/lite/demo/cxx/cuda_demo/CMakeLists.txt
index e27548b4e56ce03098c5c82b3eee49add62cc0a4..f057a1f189fdb92ff33f00d5ceacc83f7fc28c5d 100644
--- a/lite/demo/cxx/cuda_demo/CMakeLists.txt
+++ b/lite/demo/cxx/cuda_demo/CMakeLists.txt
@@ -1,20 +1,24 @@
-project(demo CXX C)
 cmake_minimum_required(VERSION 2.8)
+project(demo CXX C)
+
+add_definitions(-DLITE_WITH_CUDA)
 
 set(TARGET demo)
 set(CMAKE_CXX_FLAGS "-std=c++11 -O3")
-set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx")
-set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
+set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx")
+set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
 
-include_directories("${LITE_LIB}/include")
-link_directories("${LITE_LIB}/lib")
-link_directories("${PROTOBUF_LIB}/lib")
+include_directories("${LITE_ROOT}/include")
+link_directories("${LITE_ROOT}/lib")
+link_directories("${PROTOBUF_ROOT}/lib")
+# cuda lib
+link_directories("/usr/local/cuda/lib64/")
 
 add_executable(${TARGET} ${TARGET}.cc)
 
-set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so)
+set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so)
 set(DEPS ${DEPS} protobuf-lite)
-set(DEPS ${DEPS} "-lrt -lpthread -ldl")
+set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart")
 
 target_link_libraries(${TARGET} ${DEPS})
diff --git a/lite/kernels/arm/scale_compute.cc b/lite/kernels/arm/scale_compute.cc
index 71192d7b937116966a5b95a7620805065fdd152e..c6f91f209b42ea6f2f99a7741e90c0eb9103952b 100644
--- a/lite/kernels/arm/scale_compute.cc
+++ b/lite/kernels/arm/scale_compute.cc
@@ -31,7 +31,18 @@ void ScaleCompute<T, PType>::Run() {
   if (!param.bias_after_scale) {
     bias *= scale;
   }
-  lite::arm::math::scale(x_data, output_data, num, scale, bias);
+  T alpha = param.alpha;
+  if (param.activation_type == "") {  // no act
+    lite::arm::math::scale(x_data, output_data, num, scale, bias);
+  } else if (param.activation_type == "relu") {  // do relu
+    lite::arm::math::scale_relu(x_data, output_data, num, scale, bias);
+  } else if (param.activation_type == "relu6") {  // do relu6
+    lite::arm::math::scale_relu6(
+        x_data, output_data, num, scale, bias, alpha);
+  } else if (param.activation_type == "leaky_relu") {  // do leaky_relu
+    lite::arm::math::scale_leaky_relu(
+        x_data, output_data, num, scale, bias, alpha);
+  }
   if (!param.x->lod().empty()) {
     param.output->set_lod(param.x->lod());
   }
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index 599da099d7d6e2cdee38ed64b8b3ece8cf8582c9..d2ae0ceb20d40aac662fd3068be79fd266f9e984 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -244,6 +244,9 @@ struct ScaleParam : ParamBase {
   float scale{1.};
   float bias{};
   bool bias_after_scale{true};
+  std::string activation_type{""};
+  bool fuse_relu{false};
+  float alpha{6.};
   ///////////////////////////////////////////////////////////////////////////////////
   // get a vector of input tensors
   const std::vector<const Tensor*>* input_tensor_ptrs() override {
diff --git a/lite/operators/scale_op.cc b/lite/operators/scale_op.cc
index d2090076fe387198bbb2db904a73940504ba7841..85e29bef7882113614d15e171ab80b966da4ca50 100644
--- a/lite/operators/scale_op.cc
+++ b/lite/operators/scale_op.cc
@@ -38,6 +38,20 @@ bool ScaleOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
   param_.scale = op_desc.GetAttr<float>("scale");
   param_.bias = op_desc.GetAttr<float>("bias");
   param_.bias_after_scale = op_desc.GetAttr<bool>("bias_after_scale");
+  if (op_desc.HasAttr("activation_type")) {
+    auto act_type = op_desc.GetAttr<std::string>("activation_type");
+    param_.activation_type = act_type;
+    if (act_type == "relu") {
+      param_.fuse_relu = true;
+    } else if (act_type == "relu6") {
+      param_.alpha = op_desc.GetAttr<float>("alpha");  // 6.f
+    } else if (act_type == "leaky_relu") {
+      param_.alpha = op_desc.GetAttr<float>("alpha");
+    } else {
+      CHECK(false) << "The fused scale op only supports relu, relu6 "
+                      "and leaky_relu activations";
+    }
+  }
   CHECK(param_.x);
   CHECK(param_.output);
   return true;
diff --git a/lite/tools/build.sh b/lite/tools/build.sh
index 790d91ad422213e3fab04f7bb6869614822f9149..365646a6e8e954910ab864ed905494746de163ef 100755
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -350,6 +350,7 @@ function make_cuda {
             -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
             -DWITH_TESTING=OFF \
             -DLITE_WITH_ARM=OFF \
+            -DLITE_WITH_STATIC_CUDA=OFF \
             -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
             -DLITE_BUILD_EXTRA=ON \
             -DLITE_WITH_XPU=$BUILD_XPU \
diff --git a/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl
index 6937c334c809dca340a4dbb69a758ad9238b86d3..fa504a6ed19503553be99180fc2a748e3f59643a 100644
--- a/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl
@@ -13,70 +13,75 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-__kernel void bilinear_interp(__read_only image2d_t input, __write_only image2d_t output,
-                              __private const float scale_h, __private const float scale_w,
-                              __private const int in_dims_h, __private const int out_dims_h,
-                              __private const int in_dims_w, __private const int out_dims_w,
-                              __private const float align_delta) {
-    const int c = get_global_id(0);
-    const int w = get_global_id(1);
-    const int nh = get_global_id(2);
+__kernel void bilinear_interp(
+    __read_only image2d_t input, __write_only image2d_t output,
+    __private const float scale_h, __private const float scale_w,
+    __private const int in_dims_h, __private const int out_dims_h,
+    __private const int in_dims_w, __private const int out_dims_w,
+    __private const float align_delta) {
+  const int c = get_global_id(0);
+  const int w = get_global_id(1);
+  const int nh = get_global_id(2);
 
-    int2 output_pos;
-    output_pos.x = c * out_dims_w + w;
-    output_pos.y = nh;
+  int2 output_pos;
+  output_pos.x = c * out_dims_w + w;
+  output_pos.y = nh;
 
-    // calculate center pixel's pos
-    int out_n = nh / out_dims_h;
-    int out_h = nh % out_dims_h;
-    float center_w = (w + align_delta) * scale_w - align_delta;
-    float center_h = (out_h + align_delta) * scale_h - align_delta;
+  // calculate center pixel's pos
+  int out_n = nh / out_dims_h;
+  int out_h = nh % out_dims_h;
+  float center_w = (w + align_delta) * scale_w - align_delta;
+  float center_h = (out_h + align_delta) * scale_h - align_delta;
 
-    int floor_w = (int)center_w;
-    int floor_h = (int)center_h;
-    int ceil_w = floor_w + 1;
-    int ceil_h = floor_h + 1;
+  int floor_w = (int)center_w;
+  int floor_h = (int)center_h;
+  int ceil_w = floor_w + 1;
+  int ceil_h = floor_h + 1;
 
-    if (ceil_w > in_dims_w) {
-        ceil_w = floor_w;
-    }
-    if (ceil_h > in_dims_h) {
-        ceil_h = floor_h;
-    }
-    float wight0_w = center_w - floor_w;
-    float wight0_h = center_h - floor_h;
-    float wight1_w = 1.0 - wight0_w;
-    float wight1_h = 1.0 - wight0_h;
+  if (ceil_w > in_dims_w) {
+    ceil_w = floor_w;
+  }
+  if (ceil_h > in_dims_h) {
+    ceil_h = floor_h;
+  }
+  float wight0_w = center_w - floor_w;
+  float wight0_h = center_h - floor_h;
+  float wight1_w = 1.0f - wight0_w;
+  float wight1_h = 1.0f - wight0_h;
 
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-    // get left up pixel data
-    int2 left_up;
-    left_up.x = c * in_dims_w + floor_w;
-    left_up.y = out_n * in_dims_h + ceil_h;
-    half4 left_up_data = read_imageh(input, sampler, left_up);
+  // get left up pixel data
+  int2 left_up;
+  left_up.x = c * in_dims_w + floor_w;
+  left_up.y = out_n * in_dims_h + ceil_h;
+  half4 left_up_data = read_imageh(input, sampler, left_up);
 
-    // get left down pixel data
-    int2 left_down;
-    left_down.x = c * in_dims_w + floor_w;
-    left_down.y = out_n * in_dims_h + floor_h;
-    half4 left_down_data = read_imageh(input, sampler, left_down);
+  // get left down pixel data
+  int2 left_down;
+  left_down.x = c * in_dims_w + floor_w;
+  left_down.y = out_n * in_dims_h + floor_h;
+  half4 left_down_data = read_imageh(input, sampler, left_down);
 
-    // get right up pixel data
-    int2 right_up;
-    right_up.x = c * in_dims_w + ceil_w;
-    right_up.y = out_n * in_dims_h + ceil_h;
-    half4 right_up_data = read_imageh(input, sampler, right_up);
+  // get right up pixel data
+  int2 right_up;
+  right_up.x = c * in_dims_w + ceil_w;
+  right_up.y = out_n * in_dims_h + ceil_h;
+  half4 right_up_data = read_imageh(input, sampler, right_up);
 
-    // get right down pixel's data
-    int2 right_down;
-    right_down.x = c * in_dims_w + ceil_w;
-    right_down.y = out_n * in_dims_h + floor_h;
-    half4 right_down_data = read_imageh(input, sampler, right_down);
+  // get right down pixel's data
+  int2 right_down;
+  right_down.x = c * in_dims_w + ceil_w;
+  right_down.y = out_n * in_dims_h + floor_h;
+  half4 right_down_data = read_imageh(input, sampler, right_down);
 
-    // calculate output data
-    half4 data = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h
-               + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h;
+  // calculate output data
+  half4 data =
+      (left_down_data * (half)wight1_w + right_down_data * (half)wight0_w) *
+          (half)wight1_h +
+      (left_up_data * (half)wight1_w + right_up_data * (half)wight0_w) *
+          (half)wight0_h;
 
-    write_imageh(output, output_pos, data);
+  write_imageh(output, output_pos, data);
 }
\ No newline at end of file
diff --git a/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp b/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp
index 439554ec10696913b42923177828870790f0f711..d0f377faee8667a43d3286309e95e8673d9a6a62 100644
--- a/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp
@@ -30,8 +30,6 @@ bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
     build_options = "-DLOCAL_MEM_128";
   } else if (h == 64) {
     build_options = "-DLOCAL_MEM_64";
-  } else if (h > 256) {
-    PADDLE_MOBILE_THROW_EXCEPTION("instance norm unsupported input height");
   }
   this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
                              build_options);
diff --git a/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp b/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp
index 270d77c4a051df227719338f6793e64aa2920f9f..bd1d1f87424d48be92777f7e7a72f08b66aa07c7 100644
--- a/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp
@@ -26,13 +26,11 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init(
     FusionInstanceNormReluParam<GPU_CL> *param) {
   auto &dims = param->Out()->dims();
   const int h = dims[2];
-  std::string build_options = "-DRELU";
+  std::string build_options = " -DRELU";
   if (h == 128) {
     build_options += " -DLOCAL_MEM_128";
   } else if (h == 64) {
     build_options += " -DLOCAL_MEM_64";
-  } else if (h > 256) {
-    PADDLE_MOBILE_THROW_EXCEPTION("instance norm unsupported input height");
   }
   this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
                              build_options);
diff --git a/mobile/tools/op.cmake b/mobile/tools/op.cmake
index cd84b9cbde2252e2947418c5d6f02ea0097f1527..44f2bc0f088950ede560766a8fd130214200e780 100755
--- a/mobile/tools/op.cmake
+++ b/mobile/tools/op.cmake
@@ -442,9 +442,9 @@ endif()
 if (FILL_CONSTANT_OP)
   add_definitions(-DFILL_CONSTANT_OP)
 endif()
-if (FUSION_CONVADD_OP)
-  add_definitions(-DFUSION_CONVADD_OP)
-endif()
+# if (FUSION_CONVADD_OP)
+#   add_definitions(-DFUSION_CONVADD_OP)
+# endif()
 if (FUSION_CONVADDRELU_OP)
   add_definitions(-DFUSION_CONVADDRELU_OP)
 endif()
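
Reviewer note (not part of the patch): the pass rewrites `act(scale(x))` into a single `scale` op, so the fused kernels must reproduce exactly the composition of the two standalone ops, including the bias folding (`bias *= scale`) that `ScaleCompute::Run` performs when `bias_after_scale` is false. The sketch below is a minimal standalone check of that identity; the file name, helper names, and tolerance are illustrative assumptions, not PaddleLite APIs.

```cpp
// scale_act_check.cc -- standalone sanity check, hypothetical, not in-tree.
// Build: g++ -std=c++11 scale_act_check.cc -o scale_act_check
#include <cassert>
#include <cmath>
#include <cstdio>
#include <string>

// Fused semantics: act(x * scale + bias), with bias pre-folded when
// bias_after_scale is false, mirroring ScaleCompute::Run above.
static float fused(float x, float scale, float bias, bool bias_after_scale,
                   const std::string& act, float alpha) {
  if (!bias_after_scale) bias *= scale;
  float y = x * scale + bias;
  if (act == "relu") y = y > 0.f ? y : 0.f;
  if (act == "relu6") y = y > 0.f ? (y < alpha ? y : alpha) : 0.f;
  if (act == "leaky_relu") y = y > 0.f ? y : y * alpha;
  return y;
}

// Unfused semantics: the standalone scale op followed by a standalone
// activation op, i.e. the subgraph ScaleActivationFuser matches.
static float unfused(float x, float scale, float bias, bool bias_after_scale,
                     const std::string& act, float alpha) {
  float y = bias_after_scale ? x * scale + bias : (x + bias) * scale;
  if (act == "relu") y = std::fmax(y, 0.f);
  if (act == "relu6") y = std::fmin(std::fmax(y, 0.f), alpha);
  if (act == "leaky_relu") y = y > 0.f ? y : y * alpha;
  return y;
}

int main() {
  const float scale = 0.5f;
  const float bias = -1.25f;
  for (std::string act : {"", "relu", "relu6", "leaky_relu"}) {
    // relu6 clips at its threshold; leaky_relu scales the negative half.
    float alpha = (act == "leaky_relu") ? 0.1f : 6.f;
    for (int bas = 0; bas <= 1; ++bas) {
      for (float x = -8.f; x <= 8.f; x += 0.25f) {
        float a = fused(x, scale, bias, bas != 0, act, alpha);
        float b = unfused(x, scale, bias, bas != 0, act, alpha);
        assert(std::fabs(a - b) < 1e-6f);  // folding must be exact
      }
    }
  }
  std::printf("fused and unfused scale+activation agree\n");
  return 0;
}
```

Running it exercises all four activation paths with both `bias_after_scale` settings; the same scalar expressions also serve as the reference for the `remain` tails of the NEON kernels in `lite/backends/arm/math/scale.cc`.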