diff --git a/docs/demo_guides/cuda.md b/docs/demo_guides/cuda.md
index 8b3e76acef590bda19a59388017added6a0b8d52..f863fd86864194c6d022e4cf1fc75eb46725cc2c 100644
--- a/docs/demo_guides/cuda.md
+++ b/docs/demo_guides/cuda.md
@@ -48,7 +48,7 @@ The CUDA build output is located in `build_cuda/inference_lite_lib`
 
 4. `demo` directory: C++ demos.
 
-If the Python option is enabled for the build, `lite_core.so` is generated under `build_cuda/inference_lite_lib/python/lib/`.
+If the Python option is enabled for the build, `lite.so` is generated under `build_cuda/inference_lite_lib/python/lib/`.
 
 ## Run
 
@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
 
 Step 2: run
 
-**NOTE:**This example uses the Python API.
+**NOTE:** This example uses the Python API.
 
 ``` python
 #-*- coding: utf-8 -*-
@@ -75,7 +75,7 @@ import sys
 import numpy as np
 import cv2
 sys.path.append('build_cuda/inference_lite_lib/python/lib')
-from lite_core import *
+from lite import *
 
 def read_img(im_path, resize_h, resize_w):
   im = cv2.imread(im_path).astype('float32')
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index dfb0a7fa68579e24eac22a7edee89a8cf9e12d5c..b08f2f5c745f87cda2be181bdea2444b2c11313c 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -181,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase {
 #endif
 #ifdef LITE_WITH_CUDA
   void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
-  int multi_stream() const { return multi_stream_; }
+  bool multi_stream() const { return multi_stream_; }
 #endif
 
 #ifdef LITE_WITH_MLU
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index e81bebe1a31656409ed718b29b956a7a66560248..8cb4dbf192993219347d70bb8ccb704199b45f3d 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -52,6 +52,7 @@ USE_MIR_PASS(mlu_postprocess_pass);
 USE_MIR_PASS(weight_quantization_preprocess_pass);
 USE_MIR_PASS(apu_subgraph_pass);
 USE_MIR_PASS(quantized_op_attributes_inference_pass);
+USE_MIR_PASS(lite_scale_activation_fuse_pass);
 USE_MIR_PASS(__xpu__resnet_fuse_pass);
 USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
 USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc
index 5aad98c05c56f85931b7a0276d0a85b426573c4c..aab1058b9dd66522a0793fc151c54707505d1fbb 100644
--- a/lite/backends/arm/math/scale.cc
+++ b/lite/backends/arm/math/scale.cc
@@ -27,31 +27,467 @@ void scale(
   int remain = num % 16;
   float32x4_t vscale = vdupq_n_f32(scale);
   float32x4_t vbias = vdupq_n_f32(bias);
+  if (cnt > 0) {
+#ifdef __aarch64__
+    asm volatile(
+        "1:                                       \n"
+        "ld1 {v4.4s}, [%[din]], #16               \n"
+        "and v8.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v5.4s}, [%[din]], #16               \n"
+        "and v9.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v6.4s}, [%[din]], #16               \n"
+        "and v10.16b, %[vbias].16b, %[vbias].16b  \n"
+        "ld1 {v7.4s}, [%[din]], #16               \n"
+        "and v11.16b, %[vbias].16b, %[vbias].16b  \n"
+
+        "fmla v8.4s, v4.4s, %[vscale].4s          \n"
+        "fmla v9.4s, v5.4s, %[vscale].4s          \n"
+        "fmla v10.4s, v6.4s, %[vscale].4s         \n"
+        "fmla v11.4s, v7.4s, %[vscale].4s         \n"
+
+        "stp q8, q9, [%[dout]], #32               \n"
+        "subs %w[cnt], %w[cnt], #1                \n"
+        "stp q10, q11, [%[dout]], #32             \n"
+
+        "bne 1b                                   \n"
+        "0:                                       \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale), [vbias] "w"(vbias)
+        : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
+#else
+    asm volatile(
+        "1:                                 @ loop header \n"
+        "vld1.32 {d8-d11}, [%[din]]!        @ load din 0 \n"
+        "vand.32 q8, %q[vbias], %q[vbias]   @ out bias \n"
+        "vand.32 q9, %q[vbias], %q[vbias]   @ out bias \n"
+        "vld1.32 {d12-d15}, [%[din]]!       @ load din 0 \n"
@ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + dout++; + din++; + } + } +} + +template <> +void scale_relu( + const float* din, float* dout, int num, float scale, float bias) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b\n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b\n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vmax.f32 q8, q8, %q[vzero] @ relu \n" + "vmax.f32 q9, q9, %q[vzero] @ relu \n" + "vmax.f32 q10, q10, %q[vzero] @ relu \n" + "vmax.f32 q11, q11, %q[vzero] @ relu \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? 
+      dout++;
+      din++;
+    }
+  }
+}
+
+template <>
+void scale_relu6(const float* din,
+                 float* dout,
+                 int num,
+                 float scale,
+                 float bias,
+                 float alpha) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  float32x4_t vscale = vdupq_n_f32(scale);
+  float32x4_t vbias = vdupq_n_f32(bias);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  float32x4_t valpha = vdupq_n_f32(alpha);
+  if (cnt > 0) {
+#ifdef __aarch64__
+    asm volatile(
+        "1:                                       \n"
+        "ld1 {v4.4s}, [%[din]], #16               \n"
+        "and v8.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v5.4s}, [%[din]], #16               \n"
+        "and v9.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v6.4s}, [%[din]], #16               \n"
+        "and v10.16b, %[vbias].16b, %[vbias].16b  \n"
+        "ld1 {v7.4s}, [%[din]], #16               \n"
+        "and v11.16b, %[vbias].16b, %[vbias].16b  \n"
+
+        "fmla v8.4s, v4.4s, %[vscale].4s          \n"
+        "fmla v9.4s, v5.4s, %[vscale].4s          \n"
+        "fmla v10.4s, v6.4s, %[vscale].4s         \n"
+        "fmla v11.4s, v7.4s, %[vscale].4s         \n"
+
+        "fmax v8.4s, v8.4s, %[vzero].4s           \n"
+        "fmax v9.4s, v9.4s, %[vzero].4s           \n"
+        "fmax v10.4s, v10.4s, %[vzero].4s         \n"
+        "fmax v11.4s, v11.4s, %[vzero].4s         \n"
+
+        "fmin v8.4s, v8.4s, %[valpha].4s          \n"
+        "fmin v9.4s, v9.4s, %[valpha].4s          \n"
+        "fmin v10.4s, v10.4s, %[valpha].4s        \n"
+        "fmin v11.4s, v11.4s, %[valpha].4s        \n"
+
+        "stp q8, q9, [%[dout]], #32               \n"
+        "subs %w[cnt], %w[cnt], #1                \n"
+        "stp q10, q11, [%[dout]], #32             \n"
+        "bne 1b                                   \n"
+        "0:                                       \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale),
+          [vbias] "w"(vbias),
+          [vzero] "w"(vzero),
+          [valpha] "w"(valpha)
+        : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
+#else
+    asm volatile(
+        "1:                                 @ loop header \n"
+        "vld1.32 {d8-d11}, [%[din]]!        @ load din 0 \n"
+        "vand.32 q8, %q[vbias], %q[vbias]   @ out bias \n"
+        "vand.32 q9, %q[vbias], %q[vbias]   @ out bias \n"
+        "vld1.32 {d12-d15}, [%[din]]!       @ load din 0 \n"
+
+        "vand.32 q10, %q[vbias], %q[vbias]  @ out bias \n"
+        "vand.32 q11, %q[vbias], %q[vbias]  @ out bias \n"
+
+        "vmla.f32 q8, q4, %q[vscale]        @ mla \n"
+        "vmla.f32 q9, q5, %q[vscale]        @ mla \n"
+        "vmla.f32 q10, q6, %q[vscale]       @ mla \n"
+        "vmla.f32 q11, q7, %q[vscale]       @ mla \n"
+
+        "vmax.f32 q8, q8, %q[vzero]         @ relu \n"
+        "vmax.f32 q9, q9, %q[vzero]         @ relu \n"
+        "vmax.f32 q10, q10, %q[vzero]       @ relu \n"
+        "vmax.f32 q11, q11, %q[vzero]       @ relu \n"
+
+        "vmin.f32 q8, q8, %q[valpha]        @ relu6 \n"
+        "vmin.f32 q9, q9, %q[valpha]        @ relu6 \n"
+        "vmin.f32 q10, q10, %q[valpha]      @ relu6 \n"
+        "vmin.f32 q11, q11, %q[valpha]      @ relu6 \n"
+
+        "vst1.32 {d16-d19}, [%[dout]]!      @ store result, add pointer \n"
+        "subs %[cnt], #1                    @ loop count minus 1 \n"
+        "vst1.32 {d20-d23}, [%[dout]]!      @ store result, add pointer \n"
+
+        "bne 1b                             @ jump to main loop start \n"
+        "2:                                 \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale),
+          [vbias] "w"(vbias),
+          [vzero] "w"(vzero),
+          [valpha] "w"(valpha)
+        : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
+#endif
+  }
+  if (remain > 0) {
+    for (int i = 0; i < remain; i++) {
+      *dout = *din * scale + bias;
+      *dout = *dout > 0.f ? (*dout < alpha ? *dout : alpha) : 0.f;
+      dout++;
+      din++;
+    }
+  }
+}
+
+template <>
+void scale_leaky_relu(const float* din,
+                      float* dout,
+                      int num,
+                      float scale,
+                      float bias,
+                      float alpha) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  float32x4_t vscale = vdupq_n_f32(scale);
+  float32x4_t vbias = vdupq_n_f32(bias);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  float32x4_t valpha = vdupq_n_f32(alpha);
+  if (cnt > 0) {
+#ifdef __aarch64__
+    asm volatile(
+        "1:                                       \n"
+        "ld1 {v4.4s}, [%[din]], #16               \n"
+        "and v8.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v5.4s}, [%[din]], #16               \n"
+        "and v9.16b, %[vbias].16b, %[vbias].16b   \n"
+        "ld1 {v6.4s}, [%[din]], #16               \n"
+        "and v10.16b, %[vbias].16b, %[vbias].16b  \n"
+        "ld1 {v7.4s}, [%[din]], #16               \n"
+        "and v11.16b, %[vbias].16b, %[vbias].16b  \n"
+
+        "fmla v8.4s, v4.4s, %[vscale].4s          \n"
+        "fmla v9.4s, v5.4s, %[vscale].4s          \n"
+        "fmla v10.4s, v6.4s, %[vscale].4s         \n"
+        "fmla v11.4s, v7.4s, %[vscale].4s         \n"
+
+        "fcmge v12.4s, v8.4s, %[vzero].4s         \n"
+        "fmul v16.4s, v8.4s, %[valpha].4s         \n"
+
+        "fcmge v13.4s, v9.4s, %[vzero].4s         \n"
+        "fmul v17.4s, v9.4s, %[valpha].4s         \n"
+
+        "fcmge v14.4s, v10.4s, %[vzero].4s        \n"
+        "fmul v18.4s, v10.4s, %[valpha].4s        \n"
+
+        "fcmge v15.4s, v11.4s, %[vzero].4s        \n"
+        "fmul v19.4s, v11.4s, %[valpha].4s        \n"
+
+        "bif v8.16b, v16.16b, v12.16b             \n" /* choose */
+        "bif v9.16b, v17.16b, v13.16b             \n" /* choose */
+        "bif v10.16b, v18.16b, v14.16b            \n" /* choose */
+        "bif v11.16b, v19.16b, v15.16b            \n" /* choose */
+
+        "stp q8, q9, [%[dout]], #32               \n"
+        "subs %w[cnt], %w[cnt], #1                \n"
+        "stp q10, q11, [%[dout]], #32             \n"
+        "bne 1b                                   \n"
+        "0:                                       \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale),
+          [vbias] "w"(vbias),
+          [vzero] "w"(vzero),
+          [valpha] "w"(valpha)
+        : "cc",
+          "memory",
+          "v4",
+          "v5",
+          "v6",
+          "v7",
+          "v8",
+          "v9",
+          "v10",
+          "v11",
+          "v12",
+          "v13",
+          "v14",
+          "v15",
+          "v16",
+          "v17",
+          "v18",
+          "v19");
+#else
+    asm volatile(
+        "1:                                 @ loop header \n"
+        "vld1.32 {d8-d11}, [%[din]]!        @ load din 0 \n"
+        "vand.32 q8, %q[vbias], %q[vbias]   @ out bias \n"
+        "vand.32 q9, %q[vbias], %q[vbias]   @ out bias \n"
+        "vld1.32 {d12-d15}, [%[din]]!       @ load din 0 \n"
+
+        "vand.32 q10, %q[vbias], %q[vbias]  @ out bias \n"
+        "vand.32 q11, %q[vbias], %q[vbias]  @ out bias \n"
+
+        "vmla.f32 q8, q4, %q[vscale]        @ mla \n"
+        "vmla.f32 q9, q5, %q[vscale]        @ mla \n"
+        "vmla.f32 q10, q6, %q[vscale]       @ mla \n"
+        "vmla.f32 q11, q7, %q[vscale]       @ mla \n"
+
+        "vcge.f32 q12, q8, %q[vzero]        @ compare \n"
+        "vmul.f32 q14, q8, %q[valpha]       @ mul \n"
+        "vcge.f32 q13, q9, %q[vzero]        @ compare \n"
+        "vmul.f32 q15, q9, %q[valpha]       @ mul \n"
+        "vbif q8, q14, q12                  @ choose \n"
+        "vbif q9, q15, q13                  @ choose \n"
+
+        "vcge.f32 q12, q10, %q[vzero]       @ compare \n"
+        "vmul.f32 q14, q10, %q[valpha]      @ mul \n"
+        "vcge.f32 q13, q11, %q[vzero]       @ compare \n"
+        "vmul.f32 q15, q11, %q[valpha]      @ mul \n"
+
+        "vst1.32 {d16-d19}, [%[dout]]!      @ store result, add pointer \n"
+
+        "vbif q10, q14, q12                 @ choose \n"
+        "vbif q11, q15, q13                 @ choose \n"
+        "subs %[cnt], #1                    @ loop count minus 1 \n"
+        "vst1.32 {d20-d23}, [%[dout]]!      @ store result, add pointer \n"
+
+        "bne 1b                             @ jump to main loop start \n"
+        "2:                                 \n"
+        : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt)
+        : [vscale] "w"(vscale),
+          [vbias] "w"(vbias),
+          [vzero] "w"(vzero),
+          [valpha] "w"(valpha)
+        : "cc",
+          "memory",
+          "q4",
+          "q5",
+          "q6",
+          "q7",
+          "q8",
+          "q9",
+          "q10",
+          "q11",
+          "q12",
+          "q13",
+          "q14",
+          "q15");
+#endif
+  }
+  if (remain > 0) {
+    for (int i = 0; i < remain; i++) {
+      *dout = *din * scale + bias;
+      *dout = *dout > 0.f ? *dout : (*dout * alpha);
+      dout++;
+      din++;
+    }
+  }
+}
+
+template <>
+void scale(const int* din, int* dout, int num, int scale, int bias) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  int32x4_t vscale = vdupq_n_s32(scale);
+  int32x4_t vbias = vdupq_n_s32(bias);
+#pragma omp parallel for
+  for (int i = 0; i < cnt; i++) {
+    const int* din_ptr = din + (i << 4);
+    int* dout_ptr = dout + (i << 4);
+
+    int32x4_t din0 = vld1q_s32(din_ptr);
+    int32x4_t din1 = vld1q_s32(din_ptr + 4);
+    int32x4_t din2 = vld1q_s32(din_ptr + 8);
+    int32x4_t din3 = vld1q_s32(din_ptr + 12);
+
+    int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale);
+    int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale);
+    int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
+    int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
+
+    vst1q_s32(dout_ptr, vsum1);
+    vst1q_s32(dout_ptr + 4, vsum2);
+    vst1q_s32(dout_ptr + 8, vsum3);
+    vst1q_s32(dout_ptr + 12, vsum4);
+  }
+  if (remain > 0) {
+    const int* din_ptr = din + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
+    for (int i = 0; i < remain; i++) {
+      *dout_ptr = *din_ptr * scale + bias;
+      dout_ptr++;
+      din_ptr++;
+    }
+  }
+}
+
+template <>
+void scale_relu(const int* din, int* dout, int num, int scale, int bias) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  int32x4_t vscale = vdupq_n_s32(scale);
+  int32x4_t vbias = vdupq_n_s32(bias);
+  int32x4_t vzero = vdupq_n_s32(0);
 #pragma omp parallel for
   for (int i = 0; i < cnt; i++) {
-    const float* din_ptr = din + (i << 4);
-    float* dout_ptr = dout + (i << 4);
+    const int* din_ptr = din + (i << 4);
+    int* dout_ptr = dout + (i << 4);
 
-    float32x4_t din0 = vld1q_f32(din_ptr);
-    float32x4_t din1 = vld1q_f32(din_ptr + 4);
-    float32x4_t din2 = vld1q_f32(din_ptr + 8);
-    float32x4_t din3 = vld1q_f32(din_ptr + 12);
+    int32x4_t din0 = vld1q_s32(din_ptr);
+    int32x4_t din1 = vld1q_s32(din_ptr + 4);
+    int32x4_t din2 = vld1q_s32(din_ptr + 8);
+    int32x4_t din3 = vld1q_s32(din_ptr + 12);
 
-    float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
-    float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
-    float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
-    float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
+    int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale);
+    int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale);
+    int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
+    int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
 
-    vst1q_f32(dout_ptr, vsum1);
-    vst1q_f32(dout_ptr + 4, vsum2);
-    vst1q_f32(dout_ptr + 8, vsum3);
-    vst1q_f32(dout_ptr + 12, vsum4);
+    vsum1 = vmaxq_s32(vsum1, vzero);
+    vsum2 = vmaxq_s32(vsum2, vzero);
+    vsum3 = vmaxq_s32(vsum3, vzero);
+    vsum4 = vmaxq_s32(vsum4, vzero);
+
+    vst1q_s32(dout_ptr, vsum1);
+    vst1q_s32(dout_ptr + 4, vsum2);
+    vst1q_s32(dout_ptr + 8, vsum3);
+    vst1q_s32(dout_ptr + 12, vsum4);
   }
   if (remain > 0) {
-    const float* din_ptr = din + (cnt << 4);
-    float* dout_ptr = dout + (cnt << 4);
+    const int* din_ptr = din + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
     for (int i = 0; i < remain; i++) {
       *dout_ptr = *din_ptr * scale + bias;
+      *dout_ptr = *dout_ptr > 0 ? *dout_ptr : 0;
       dout_ptr++;
       din_ptr++;
     }
@@ -59,11 +495,66 @@
 }
 
 template <>
-void scale(const int* din, int* dout, int num, int scale, int bias) {
+void scale_relu6(
+    const int* din, int* dout, int num, int scale, int bias, int alpha) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  int32x4_t vscale = vdupq_n_s32(scale);
+  int32x4_t vbias = vdupq_n_s32(bias);
+  int32x4_t vzero = vdupq_n_s32(0);
+  int32x4_t valpha = vdupq_n_s32(alpha);
+#pragma omp parallel for
+  for (int i = 0; i < cnt; i++) {
+    const int* din_ptr = din + (i << 4);
+    int* dout_ptr = dout + (i << 4);
+
+    int32x4_t din0 = vld1q_s32(din_ptr);
+    int32x4_t din1 = vld1q_s32(din_ptr + 4);
+    int32x4_t din2 = vld1q_s32(din_ptr + 8);
+    int32x4_t din3 = vld1q_s32(din_ptr + 12);
+
+    int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale);
+    int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale);
+    int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
+    int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
+
+    vsum1 = vmaxq_s32(vsum1, vzero);
+    vsum2 = vmaxq_s32(vsum2, vzero);
+    vsum3 = vmaxq_s32(vsum3, vzero);
+    vsum4 = vmaxq_s32(vsum4, vzero);
+
+    vsum1 = vminq_s32(vsum1, valpha);
+    vsum2 = vminq_s32(vsum2, valpha);
+    vsum3 = vminq_s32(vsum3, valpha);
+    vsum4 = vminq_s32(vsum4, valpha);
+
+    vst1q_s32(dout_ptr, vsum1);
+    vst1q_s32(dout_ptr + 4, vsum2);
+    vst1q_s32(dout_ptr + 8, vsum3);
+    vst1q_s32(dout_ptr + 12, vsum4);
+  }
+
+  if (remain > 0) {
+    const int* din_ptr = din + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
+    for (int i = 0; i < remain; i++) {
+      *dout_ptr = *din_ptr * scale + bias;
+      *dout_ptr = *dout_ptr > 0 ? (*dout_ptr > alpha ? alpha : *dout_ptr) : 0;
+      dout_ptr++;
+      din_ptr++;
+    }
+  }
+}
+
+template <>
+void scale_leaky_relu(
+    const int* din, int* dout, int num, int scale, int bias, int alpha) {
   int cnt = num >> 4;
   int remain = num % 16;
   int32x4_t vscale = vdupq_n_s32(scale);
   int32x4_t vbias = vdupq_n_s32(bias);
+  int32x4_t vzero = vdupq_n_s32(0);
+  int32x4_t valpha = vdupq_n_s32(alpha);
 #pragma omp parallel for
   for (int i = 0; i < cnt; i++) {
     const int* din_ptr = din + (i << 4);
@@ -79,16 +570,33 @@ void scale(const int* din, int* dout, int num, int scale, int bias) {
     int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
     int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
 
+    uint32x4_t v1 = vcgeq_s32(vsum1, vzero);
+    uint32x4_t v2 = vcgeq_s32(vsum2, vzero);
+    uint32x4_t v3 = vcgeq_s32(vsum3, vzero);
+    uint32x4_t v4 = vcgeq_s32(vsum4, vzero);
+
+    int32x4_t v11 = vmulq_s32(vsum1, valpha);
+    int32x4_t v21 = vmulq_s32(vsum2, valpha);
+    int32x4_t v31 = vmulq_s32(vsum3, valpha);
+    int32x4_t v41 = vmulq_s32(vsum4, valpha);
+
+    vsum1 = vbslq_s32(v1, vsum1, v11);
+    vsum2 = vbslq_s32(v2, vsum2, v21);
+    vsum3 = vbslq_s32(v3, vsum3, v31);
+    vsum4 = vbslq_s32(v4, vsum4, v41);
+
     vst1q_s32(dout_ptr, vsum1);
     vst1q_s32(dout_ptr + 4, vsum2);
     vst1q_s32(dout_ptr + 8, vsum3);
     vst1q_s32(dout_ptr + 12, vsum4);
   }
+
   if (remain > 0) {
     const int* din_ptr = din + (cnt << 4);
     int* dout_ptr = dout + (cnt << 4);
     for (int i = 0; i < remain; i++) {
       *dout_ptr = *din_ptr * scale + bias;
+      *dout_ptr = *dout_ptr > 0 ? *dout_ptr : (*dout_ptr) * alpha;
       dout_ptr++;
       din_ptr++;
     }
diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h
index 910bea5613997c05e9257507f8f84792e0071a53..bbdb596bc8f45c247a24f9833680c8a510c1e904 100644
--- a/lite/backends/arm/math/scale.h
+++ b/lite/backends/arm/math/scale.h
@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) {
 template <typename T>
 void scale(const T* din, T* dout, int num, T scale, T bias);
 
+template <typename T>
+void scale_relu(const T* din, T* dout, int num, T scale, T bias);
+
+template <typename T>
+void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha);
+
+template <typename T>
+void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha);
+
 template <typename T>
 void scale(const T* din,
diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt
index a365fe3f7b8f04b3568fbf2c8f85af4e2469706c..b8234b18922f454c41e295209da13de024184adc 100644
--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
@@ -21,6 +21,7 @@ lite_cc_library(mir_passes
     fusion/elementwise_add_activation_fuse_pass.cc
     fusion/quant_dequant_fuse_pass.cc
     fusion/sequence_pool_concat_fuse_pass.cc
+    fusion/scale_activation_fuse_pass.cc
     fusion/__xpu__resnet_fuse_pass.cc
     fusion/__xpu__multi_encoder_fuse_pass.cc
     fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt
index 04a36976c7110c64ef781af12fc86fd4853fe583..a7a4cee798c1e8ef5b9b8f8d9e8e5810554fc571 100644
--- a/lite/core/mir/fusion/CMakeLists.txt
+++ b/lite/core/mir/fusion/CMakeLists.txt
@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate
 lite_cc_library(fuse_sequence_pool_concat
     SRCS sequence_pool_concat_fuser.cc
     DEPS pattern_matcher_high_api)
+lite_cc_library(fuse_scale_activation
+    SRCS scale_activation_fuser.cc
+    DEPS pattern_matcher_high_api)
 
 set(mir_fusers
     fuse_fc
@@ -44,6 +47,7 @@ set(mir_fusers
     fuse_transpose_softmax_transpose
     fuse_interpolate
     fuse_sequence_pool_concat
+    fuse_scale_activation
     CACHE INTERNAL "fusers")
 
 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.cc b/lite/core/mir/fusion/scale_activation_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ad1f4994f6d5183d3b5c925bb222cb95ea064e8
--- /dev/null
+++ b/lite/core/mir/fusion/scale_activation_fuse_pass.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/fusion/scale_activation_fuse_pass.h"
+#include <memory>
+#include <vector>
+#include "lite/core/mir/fusion/scale_activation_fuser.h"
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void ScaleActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  for (auto act_type : {"relu", "relu6", "leaky_relu"}) {
+    fusion::ScaleActivationFuser fuser(act_type);
+    fuser(graph.get());
+  }
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(lite_scale_activation_fuse_pass,
+                  paddle::lite::mir::ScaleActivationFusePass)
+    .BindTargets({TARGET(kARM)})
+    .BindKernel("scale");
diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.h b/lite/core/mir/fusion/scale_activation_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..2118a0b6f396ff12855009a975059c95ee6111a8
--- /dev/null
+++ b/lite/core/mir/fusion/scale_activation_fuse_pass.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class ScaleActivationFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/fusion/scale_activation_fuser.cc b/lite/core/mir/fusion/scale_activation_fuser.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f18099da8bc97d9dab8f9c31fd6c23d42d67d81
--- /dev/null
+++ b/lite/core/mir/fusion/scale_activation_fuser.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/fusion/scale_activation_fuser.h"
+#include <memory>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+void ScaleActivationFuser::BuildPattern() {
+  // create input nodes.
+  auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();
+
+  // create op nodes
+  auto* scale =
+      OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate();
+  auto* act =
+      OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();
+
+  // create intermediate nodes
+  auto* scale_out = VarNode("scale_out")
+                        ->assert_is_op_output("scale", "Out")
+                        ->assert_is_op_input(act_type_, "X")
+                        ->AsIntermediate();
+
+  // create output node
+  auto* out =
+      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
+  // create topology.
+  *x >> *scale >> *scale_out;
+  *scale_out >> *act >> *out;
+}
+
+void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
+                                         const key2nodes_t& matched) {
+  auto op_desc = GenOpDesc(matched);
+  auto scale_op = LiteOpRegistry::Global().Create("scale");
+  auto scale = matched.at("scale")->stmt()->op();
+  auto* scope = scale->scope();
+  auto& valid_places = scale->valid_places();
+  scale_op->Attach(op_desc, scope);
+
+  auto* new_op_node = graph->GraphCreateInstructNode(scale_op, valid_places);
+
+  IR_NODE_LINK_TO(matched.at("x"), new_op_node);
+  IR_NODE_LINK_TO(new_op_node, matched.at("output"));
+}
+
+cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
+  cpp::OpDesc op_desc = *matched.at("scale")->stmt()->op_info();
+  op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
+  cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info();
+
+  op_desc.SetAttr("activation_type", act_type_);
+  if (act_type_ == "relu") {
+    op_desc.SetAttr("fuse_relu", true);
+  } else if (act_type_ == "relu6") {
+    float alpha = act_op_desc.GetAttr<float>("threshold");
+    op_desc.SetAttr("alpha", alpha);
+  } else if (act_type_ == "leaky_relu") {
+    float alpha = act_op_desc.GetAttr<float>("alpha");
+    op_desc.SetAttr("alpha", alpha);
+  }
+  return op_desc;
+}
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/fusion/scale_activation_fuser.h b/lite/core/mir/fusion/scale_activation_fuser.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fa9b9d2b5ebc5091b41a2ca244689797c97ccb6
--- /dev/null
+++ b/lite/core/mir/fusion/scale_activation_fuser.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+class ScaleActivationFuser : public FuseBase {
+ public:
+  explicit ScaleActivationFuser(const std::string& act_type) {
+    act_type_ = act_type;
+  }
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+  std::string act_type_;
+};
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 9a42222e10b930ea805f4d6c048df6eba5509c86..35eba0e5ccc556369e81acfc0b6934726997d9b6 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -86,6 +86,7 @@ class Optimizer {
          "identity_scale_eliminate_pass",            //
          "elementwise_mul_constant_eliminate_pass",  //
          "lite_sequence_pool_concat_fuse_pass",      //
+         "lite_scale_activation_fuse_pass",          //
 #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
     (defined LITE_WITH_ARM)
          "lite_elementwise_add_activation_fuse_pass",  //
diff --git a/lite/demo/cxx/cuda_demo/CMakeLists.txt b/lite/demo/cxx/cuda_demo/CMakeLists.txt
index e27548b4e56ce03098c5c82b3eee49add62cc0a4..f057a1f189fdb92ff33f00d5ceacc83f7fc28c5d 100644
--- a/lite/demo/cxx/cuda_demo/CMakeLists.txt
+++ b/lite/demo/cxx/cuda_demo/CMakeLists.txt
@@ -1,20 +1,24 @@
-project(demo CXX C)
 cmake_minimum_required(VERSION 2.8)
+project(demo CXX C)
+
+add_definitions(-DLITE_WITH_CUDA)
 
 set(TARGET demo)
 set(CMAKE_CXX_FLAGS "-std=c++11 -O3")
-set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx")
-set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
+set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx")
+set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
 
-include_directories("${LITE_LIB}/include")
-link_directories("${LITE_LIB}/lib")
-link_directories("${PROTOBUF_LIB}/lib")
+include_directories("${LITE_ROOT}/include")
+link_directories("${LITE_ROOT}/lib")
+link_directories("${PROTOBUF_ROOT}/lib")
+# cuda lib
+link_directories("/usr/local/cuda/lib64/")
 
 add_executable(${TARGET} ${TARGET}.cc)
 
-set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so)
+set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so)
 set(DEPS ${DEPS} protobuf-lite)
-set(DEPS ${DEPS} "-lrt -lpthread -ldl")
+set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart")
 
 target_link_libraries(${TARGET} ${DEPS})
diff --git a/lite/kernels/arm/scale_compute.cc b/lite/kernels/arm/scale_compute.cc
index 71192d7b937116966a5b95a7620805065fdd152e..c6f91f209b42ea6f2f99a7741e90c0eb9103952b 100644
--- a/lite/kernels/arm/scale_compute.cc
+++ b/lite/kernels/arm/scale_compute.cc
@@ -31,7 +31,18 @@ void ScaleCompute<T, PType>::Run() {
   if (!param.bias_after_scale) {
     bias *= scale;
   }
-  lite::arm::math::scale(x_data, output_data, num, scale, bias);
+  T alpha = param.alpha;
+  if (param.activation_type == "") {  // no act
+    lite::arm::math::scale(x_data, output_data, num, scale, bias);
+  } else if (param.activation_type == "relu") {  // do relu
+    lite::arm::math::scale_relu(x_data, output_data, num, scale, bias);
+  } else if (param.activation_type == "relu6") {  // do relu6
+    lite::arm::math::scale_relu6(
+        x_data, output_data, num, scale, bias, alpha);
+  } else if (param.activation_type == "leaky_relu") {  // do leaky_relu
+    lite::arm::math::scale_leaky_relu(
+        x_data, output_data, num, scale, bias, alpha);
+  }
   if (!param.x->lod().empty()) {
     param.output->set_lod(param.x->lod());
   }
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index 599da099d7d6e2cdee38ed64b8b3ece8cf8582c9..d2ae0ceb20d40aac662fd3068be79fd266f9e984 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -244,6 +244,9 @@ struct ScaleParam : ParamBase {
   float scale{1.};
   float bias{};
   bool bias_after_scale{true};
+  std::string activation_type{""};
+  bool fuse_relu{false};
+  float alpha{6.};
   ///////////////////////////////////////////////////////////////////////////////////
   // get a vector of input tensors
   const std::vector<const Tensor*>* input_tensor_ptrs() override {
diff --git a/lite/operators/scale_op.cc b/lite/operators/scale_op.cc
index d2090076fe387198bbb2db904a73940504ba7841..85e29bef7882113614d15e171ab80b966da4ca50 100644
--- a/lite/operators/scale_op.cc
+++ b/lite/operators/scale_op.cc
@@ -38,6 +38,20 @@ bool ScaleOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
   param_.scale = op_desc.GetAttr<float>("scale");
   param_.bias = op_desc.GetAttr<float>("bias");
   param_.bias_after_scale = op_desc.GetAttr<bool>("bias_after_scale");
+  if (op_desc.HasAttr("activation_type")) {
+    auto act_type = op_desc.GetAttr<std::string>("activation_type");
+    param_.activation_type = act_type;
+    if (act_type == "relu") {
+      param_.fuse_relu = true;
+    } else if (act_type == "relu6") {
+      param_.alpha = op_desc.GetAttr<float>("alpha");  // 6.f
+    } else if (act_type == "leaky_relu") {
+      param_.alpha = op_desc.GetAttr<float>("alpha");
+    } else {
+      CHECK(false) << "The fused scale op only supports relu, relu6 "
+                      "and leaky_relu activations";
+    }
+  }
   CHECK(param_.x);
   CHECK(param_.output);
   return true;
diff --git a/lite/tools/build.sh b/lite/tools/build.sh
index 790d91ad422213e3fab04f7bb6869614822f9149..365646a6e8e954910ab864ed905494746de163ef 100755
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -350,6 +350,7 @@ function make_cuda {
             -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
             -DWITH_TESTING=OFF \
             -DLITE_WITH_ARM=OFF \
+            -DLITE_WITH_STATIC_CUDA=OFF \
             -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
             -DLITE_BUILD_EXTRA=ON \
             -DLITE_WITH_XPU=$BUILD_XPU \
diff --git a/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl
index 6937c334c809dca340a4dbb69a758ad9238b86d3..fa504a6ed19503553be99180fc2a748e3f59643a 100644
--- a/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl
@@ -13,70 +13,75 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-__kernel void bilinear_interp(__read_only image2d_t input, __write_only image2d_t output,
-                              __private const float scale_h, __private const float scale_w,
-                              __private const int in_dims_h, __private const int out_dims_h,
-                              __private const int in_dims_w, __private const int out_dims_w,
-                              __private const float align_delta) {
-    const int c = get_global_id(0);
-    const int w = get_global_id(1);
-    const int nh = get_global_id(2);
+__kernel void bilinear_interp(
+    __read_only image2d_t input, __write_only image2d_t output,
+    __private const float scale_h, __private const float scale_w,
+    __private const int in_dims_h, __private const int out_dims_h,
+    __private const int in_dims_w, __private const int out_dims_w,
+    __private const float align_delta) {
+  const int c = get_global_id(0);
+  const int w = get_global_id(1);
+  const int nh = get_global_id(2);
 
-    int2 output_pos;
-    output_pos.x = c * out_dims_w + w;
-    output_pos.y = nh;
+  int2 output_pos;
+  output_pos.x = c * out_dims_w + w;
+  output_pos.y = nh;
 
-    // calculate center pixel's pos
-    int out_n = nh / out_dims_h;
-    int out_h = nh % out_dims_h;
-    float center_w = (w + align_delta) * scale_w - align_delta;
-    float center_h = (out_h + align_delta) * scale_h - align_delta;
+  // calculate center pixel's pos
+  int out_n = nh / out_dims_h;
+  int out_h = nh % out_dims_h;
+  float center_w = (w + align_delta) * scale_w - align_delta;
+  float center_h = (out_h + align_delta) * scale_h - align_delta;
 
-    int floor_w = (int)center_w;
-    int floor_h = (int)center_h;
-    int ceil_w = floor_w + 1;
-    int ceil_h = floor_h + 1;
+  int floor_w = (int)center_w;
+  int floor_h = (int)center_h;
+  int ceil_w = floor_w + 1;
+  int ceil_h = floor_h + 1;
 
-    if (ceil_w > in_dims_w) {
-        ceil_w = floor_w;
-    }
-    if (ceil_h > in_dims_h) {
-        ceil_h = floor_h;
-    }
-    float wight0_w = center_w - floor_w;
-    float wight0_h = center_h - floor_h;
-    float wight1_w = 1.0 - wight0_w;
-    float wight1_h = 1.0 - wight0_h;
+  if (ceil_w > in_dims_w) {
+    ceil_w = floor_w;
+  }
+  if (ceil_h > in_dims_h) {
+    ceil_h = floor_h;
+  }
+  float wight0_w = center_w - floor_w;
+  float wight0_h = center_h - floor_h;
+  float wight1_w = 1.0f - wight0_w;
+  float wight1_h = 1.0f - wight0_h;
 
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-    // get left up pixel data
-    int2 left_up;
-    left_up.x = c * in_dims_w + floor_w;
-    left_up.y = out_n * in_dims_h + ceil_h;
-    half4 left_up_data = read_imageh(input, sampler, left_up);
+  // get left up pixel data
+  int2 left_up;
+  left_up.x = c * in_dims_w + floor_w;
+  left_up.y = out_n * in_dims_h + ceil_h;
+  half4 left_up_data = read_imageh(input, sampler, left_up);
 
-    // get left down pixel data
-    int2 left_down;
-    left_down.x = c * in_dims_w + floor_w;
-    left_down.y = out_n * in_dims_h + floor_h;
-    half4 left_down_data = read_imageh(input, sampler, left_down);
+  // get left down pixel data
+  int2 left_down;
+  left_down.x = c * in_dims_w + floor_w;
+  left_down.y = out_n * in_dims_h + floor_h;
+  half4 left_down_data = read_imageh(input, sampler, left_down);
 
-    // get right up pixel data
-    int2 right_up;
-    right_up.x = c * in_dims_w + ceil_w;
-    right_up.y = out_n * in_dims_h + ceil_h;
-    half4 right_up_data = read_imageh(input, sampler, right_up);
+  // get right up pixel data
+  int2 right_up;
+  right_up.x = c * in_dims_w + ceil_w;
+  right_up.y = out_n * in_dims_h + ceil_h;
+  half4 right_up_data = read_imageh(input, sampler, right_up);
 
-    // get right down pixel's data
-    int2 right_down;
-    right_down.x = c * in_dims_w + ceil_w;
-    right_down.y = out_n * in_dims_h + floor_h;
-    half4 right_down_data = read_imageh(input, sampler, right_down);
+  // get right down pixel's data
+  int2 right_down;
+  right_down.x = c * in_dims_w + ceil_w;
+  right_down.y = out_n * in_dims_h + floor_h;
+  half4 right_down_data = read_imageh(input, sampler, right_down);
 
-    // calculate output data
-    half4 data = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h
-               + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h;
+  // calculate output data
+  half4 data =
+      (left_down_data * (half)wight1_w + right_down_data * (half)wight0_w) *
+          (half)wight1_h +
+      (left_up_data * (half)wight1_w + right_up_data * (half)wight0_w) *
+          (half)wight0_h;
 
-    write_imageh(output, output_pos, data);
+  write_imageh(output, output_pos, data);
 }
\ No newline at end of file
diff --git a/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp b/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp
index 439554ec10696913b42923177828870790f0f711..d0f377faee8667a43d3286309e95e8673d9a6a62 100644
--- a/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp
@@ -30,8 +30,6 @@ bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
     build_options = "-DLOCAL_MEM_128";
   } else if (h == 64) {
     build_options = "-DLOCAL_MEM_64";
-  } else if (h > 256) {
-    PADDLE_MOBILE_THROW_EXCEPTION("instance norm unsupported input height");
   }
   this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
                              build_options);
diff --git a/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp b/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp
index 270d77c4a051df227719338f6793e64aa2920f9f..bd1d1f87424d48be92777f7e7a72f08b66aa07c7 100644
--- a/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp
@@ -26,13 +26,11 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init(
     FusionInstanceNormReluParam<GPU_CL> *param) {
   auto &dims = param->Out()->dims();
   const int h = dims[2];
-  std::string build_options = "-DRELU";
+  std::string build_options = " -DRELU";
   if (h == 128) {
     build_options += " -DLOCAL_MEM_128";
   } else if (h == 64) {
     build_options += " -DLOCAL_MEM_64";
-  } else if (h > 256) {
-    PADDLE_MOBILE_THROW_EXCEPTION("instance norm unsupported input height");
   }
   this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
                              build_options);
diff --git a/mobile/tools/op.cmake b/mobile/tools/op.cmake
index cd84b9cbde2252e2947418c5d6f02ea0097f1527..44f2bc0f088950ede560766a8fd130214200e780 100755
--- a/mobile/tools/op.cmake
+++ b/mobile/tools/op.cmake
@@ -442,9 +442,9 @@ endif()
 if (FILL_CONSTANT_OP)
   add_definitions(-DFILL_CONSTANT_OP)
 endif()
-if (FUSION_CONVADD_OP)
-  add_definitions(-DFUSION_CONVADD_OP)
-endif()
+# if (FUSION_CONVADD_OP)
+#   add_definitions(-DFUSION_CONVADD_OP)
+# endif()
 if (FUSION_CONVADDRELU_OP)
   add_definitions(-DFUSION_CONVADDRELU_OP)
 endif()
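
Reviewer note (not part of the patch): the pass rewrites `act(scale(x))` into a single `scale` op, so the fused kernels must reproduce exactly the composition of the two standalone ops, including the bias folding (`bias *= scale`) that `ScaleCompute::Run` performs when `bias_after_scale` is false. The sketch below is a minimal standalone check of that identity; the file name, helper names, and tolerance are illustrative assumptions, not PaddleLite APIs.

```cpp
// scale_act_check.cc -- standalone sanity check, hypothetical, not in-tree.
// Build: g++ -std=c++11 scale_act_check.cc -o scale_act_check
#include <cassert>
#include <cmath>
#include <cstdio>
#include <string>

// Fused semantics: act(x * scale + bias), with bias pre-folded when
// bias_after_scale is false, mirroring ScaleCompute::Run above.
static float fused(float x, float scale, float bias, bool bias_after_scale,
                   const std::string& act, float alpha) {
  if (!bias_after_scale) bias *= scale;
  float y = x * scale + bias;
  if (act == "relu") y = y > 0.f ? y : 0.f;
  if (act == "relu6") y = y > 0.f ? (y < alpha ? y : alpha) : 0.f;
  if (act == "leaky_relu") y = y > 0.f ? y : y * alpha;
  return y;
}

// Unfused semantics: the standalone scale op followed by a standalone
// activation op, i.e. the subgraph ScaleActivationFuser matches.
static float unfused(float x, float scale, float bias, bool bias_after_scale,
                     const std::string& act, float alpha) {
  float y = bias_after_scale ? x * scale + bias : (x + bias) * scale;
  if (act == "relu") y = std::fmax(y, 0.f);
  if (act == "relu6") y = std::fmin(std::fmax(y, 0.f), alpha);
  if (act == "leaky_relu") y = y > 0.f ? y : y * alpha;
  return y;
}

int main() {
  const float scale = 0.5f;
  const float bias = -1.25f;
  for (std::string act : {"", "relu", "relu6", "leaky_relu"}) {
    // relu6 clips at its threshold; leaky_relu scales the negative half.
    float alpha = (act == "leaky_relu") ? 0.1f : 6.f;
    for (int bas = 0; bas <= 1; ++bas) {
      for (float x = -8.f; x <= 8.f; x += 0.25f) {
        float a = fused(x, scale, bias, bas != 0, act, alpha);
        float b = unfused(x, scale, bias, bas != 0, act, alpha);
        assert(std::fabs(a - b) < 1e-6f);  // folding must be exact
      }
    }
  }
  std::printf("fused and unfused scale+activation agree\n");
  return 0;
}
```

Running it exercises all four activation paths with both `bias_after_scale` settings; the same scalar expressions also serve as the reference for the `remain` tails of the NEON kernels in `lite/backends/arm/math/scale.cc`.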