diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp
index 76d4b925e57f44c7d595995871537997bd962f69..324ee4f5381a20a9a34000045b130d61f71ec116 100644
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "fpga/V2/api.h"
 #include <algorithm>
 #include "fpga/V2/bias_scale.h"
+#include "fpga/V2/config.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
-#include "fpga/V2/config.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h
index 8f612ec57fb8c425a08dd095d95cdb3a99564658..aac97bec225a4940f710172c115e06452469d289 100644
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -22,7 +22,6 @@ limitations under the License. */
 #include "fpga/V2/driver/pe.h"
 #include "framework/tensor.h"
 
-
 namespace paddle_mobile {
 namespace fpga {
 
@@ -36,7 +35,6 @@ enum LayoutType {
   LAYOUT_HWC = 0,
 };
 
-
 struct KernelArgs {
   uint32_t width;
   uint32_t height;
@@ -122,8 +120,6 @@ struct BypassArgs {
   struct ImageOutputArgs output;
 };
 
-
-
 int open_device();
 int close_device();
 void* fpga_malloc(size_t size);
diff --git a/src/fpga/V2/config.h b/src/fpga/V2/config.h
index 56429b2848487fe5b29a23c190fa820499b99871..27187c7b854c84d501949db41fe89f9dca1d2bf1 100644
--- a/src/fpga/V2/config.h
+++ b/src/fpga/V2/config.h
@@ -14,5 +14,5 @@ limitations under the License. */
 
 #pragma once
 
-//#define PADDLE_MOBILE_ZU5
+#define PADDLE_MOBILE_ZU5  // when undefined, PerformBypass() just returns 0
 #define FPGA_PRINT_MODE
diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/driver/pe.cpp
index 63a0a9ff9423f55e629418d71226f2fba40640ec..52cde04601bc5e002ce2d8e15b3bdb1ce64b340a 100644
--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/driver/pe.cpp
@@ -13,13 +13,49 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/V2/driver/pe.h"
+#include "fpga/V2/config.h"
 #include "fpga/V2/driver/driver.h"
+#include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
-#include "fpga/V2/config.h"
 
 namespace paddle_mobile {
 namespace fpga {
-#define MUL8(x) x*8
+#define MUL8(x) ((x) * 8)  // presumably 64-bit register index -> byte offset
+#define BYPASS_DONE 1  // passed to fpga_regpoll() when polling MUL8(48)
+
+// Returns max(|lane|) / 127.0f over sixteen fp16 lanes read from hardware.
+//
+// Registers MUL8(49)..MUL8(52) are each read as one 64-bit word holding four
+// 16-bit lanes. Every lane is handed to fp16_2_fp32(), i.e. it is treated as
+// an fp16 bit pattern. Masking with 0x7fff clears the sign bit (bit 15), so
+// the lanes can be compared as unsigned integers: for non-negative fp16
+// values the integer order of the bit patterns is the order of magnitudes.
+//
+// NOTE(review): the previous mask 0x7f kept only the low 7 mantissa bits and
+// dropped the exponent, so the value returned was not a magnitude. What the
+// hardware stores in these registers is not visible here - confirm the lane
+// layout against the FPGA register map.
+float Findfp16Max() {
+  const int kFirstMaxReg = 49;  // first register index read
+  const int kLastMaxReg = 52;   // last register index read
+  const int kLanesPerReg = 4;   // 64-bit word / 16-bit lane
+  const int kLaneBits = 16;
+  const uint64_t kFp16AbsMask = 0x7fff;  // every fp16 bit except the sign
+
+  uint16_t max_abs = 0;
+  for (int reg = kFirstMaxReg; reg <= kLastMaxReg; reg++) {
+    const uint64_t packed = reg_readq(MUL8(reg));
+    for (int lane = 0; lane < kLanesPerReg; lane++) {
+      const uint16_t abs_val =
+          (uint16_t)(kFp16AbsMask & (packed >> (lane * kLaneBits)));  // NOLINT
+      if (max_abs < abs_val) {
+        max_abs = abs_val;
+      }
+    }
+  }
+  // Divide by 127, as the original did, to express the maximum as a scale.
+  return fp16_2_fp32(max_abs) / 127.0f;
+}
 
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
   ComputeBasicConv(args.conv_args[0]);
@@ -129,26 +165,55 @@ int PerformBypass(const struct BypassArgs &args) {
 #ifndef PADDLE_MOBILE_ZU5
   return 0;
 #endif
-  uint64_t bp_enable = 0x8800000000000000;
-  uint64_t *ifm_src_paddr = (uint64_t *)vaddr_to_paddr(args.image.address); // NOLINT
-  uint64_t *ifm_dst_paddr = (uint64_t *)vaddr_to_paddr(args.output.address); // NOLINT
-  uint64_t length = (args.image.channels)*(args.image.width)*(args.image.height);
 
-  bp_enable += length;
+  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t bp_enable;
+  int64_t length;
+  uint64_t pixels;
+
+  // fp32->fp16
+  if ((args.input_data_type) && (!args.output_data_type)) {
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(float);
+    bp_enable = 0x8800000000000000 + length;
+  }
+  // fp16->fp32
+  else if ((!args.input_data_type) && (args.output_data_type)) {
+    pixels = filter::calc_aligned_channel((args.image.channels)) *
+             (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    length = align_to_x((int)length, 64);  // NOLINT
+    bp_enable = 0x8a00000000000000 + length;
+  }
+  // fp16->fp16 findmax
+  else if ((!args.input_data_type) && (!args.output_data_type)) {
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    bp_enable = 0x8900000000000000 + length;
+  } else {
+    return -1;
+  }
 
-  reg_writeq(*ifm_src_paddr, MUL8(27));
-  reg_writeq(*ifm_dst_paddr, MUL8(28));
+  // start bypass
+  reg_writeq(ifm_src_paddr, MUL8(27));
+  reg_writeq(ifm_dst_paddr, MUL8(28));
   reg_writeq(0, MUL8(0));
   reg_writeq(bp_enable, MUL8(0));
-  //poll
+  // poll
   int ret = -1;
-  ret = fpga_regpoll(MUL8(48),1, 0xffffffff);
-  if(ret != -1) {
+  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  if (ret != -1) {
+    // clear "irq"
     reg_readq(MUL8(63));
   }
-
+  // get max value
+  if ((!args.input_data_type) && (!args.output_data_type)) {
+    float scale = Findfp16Max();
+    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
+    args.output.scale_address[1] = scale;
+  }
   return ret;
-
 }
 
 int ComputeFPGAConcat(const struct ConcatArgs &args) {