diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7475fed772b96fb373c7fe03c6a6fb650181849c..f5d68712a64b5a47657a7af9c0e6b47604893e23 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -213,7 +213,7 @@ else()
   set(NET "default" CACHE STRING "select net type")
 endif()
-set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets" "NLP")
+set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGA_NET_V1" "FPGA_NET_V2" "NLP")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
diff --git a/README.md b/README.md
index 2572f25444dc4268e7a6a3f43cfdc1b38dae8e02..c3a30009825a7f8f9f5c4940a847fc88fe6a840e 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,8 @@
 The development documentation mainly covers building and running the library. As a developer, you can use it together with the contribution documentation.
 * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
-* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
+* [Android_CPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
+* [Android_GPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android_GPU.md)
 * [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
 * [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)
diff --git a/doc/development_android.md b/doc/development_android.md
index 528d7aa2def78103b8dbdcf0329279f029c85cac..d0da36aa82cfc9d4826f03bd2bdf1dd8f551965a 100644
--- a/doc/development_android.md
+++ b/doc/development_android.md
@@ -1,10 +1,93 @@
 # Android development guide
-Users can cross-compile a paddle-mobile library for the Android platform in either of the following two ways:
+Users can build the library in either of the following two ways:
+- Cross-compiling on macOS or Linux
 - Building inside a Docker container
-- Cross-compiling on Linux
+## Cross-compiling on macOS or Linux
+
+Requirements: NDK r17 or later, CMake 3.0 or later.
+
+### Run the build
+Run the following commands from the paddle-mobile root directory:
+
+```
+
+cd tools
+sh build.sh android
+
+# To build a library that only supports certain networks (this keeps the package size down, since the resulting library contains only the operators needed by those models), use
+
+sh build.sh android mobilenet googlenet
+
+# These networks need to be configured in CMakeLists.txt (https://github.com/PaddlePaddle/paddle-mobile/blob/73769e7d05ef4820a115ad3fb9b1ca3f55179d03/CMakeLists.txt#L216); several common models are already configured there.
+
+```
+
+After the build finishes, the generated .so is placed in the build/release/ directory.
+
+The JNI headers are at [https://github.com/PaddlePaddle/paddle-mobile/tree/develop/src/io/jni](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/src/io/jni)
+
+The C++ header is at [https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/paddle_inference_api.h](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/paddle_inference_api.h)
+
+The unit-test executables are placed in the test/build directory.
+
+If you run into environment problems, see the environment setup section below.
+
+### Environment setup
+
+##### Download the Android NDK
+
+If Android Studio is installed on your machine, you can download and install the NDK directly from Android Studio.
+
+Alternatively, download it yourself from [https://developer.android.com/ndk/](https://developer.android.com/ndk/), or fetch it with the following commands:
+
+- macOS
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
+unzip android-ndk-r17b-darwin-x86_64.zip
+
+```
+- Linux
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
+unzip android-ndk-r17b-linux-x86_64.zip
+```
+
+##### Set the environment variable
+The standalone toolchain bundled with the project locates the NDK through the NDK_ROOT environment variable, so it must be configured:
+
+```
+export NDK_ROOT="path to ndk"
+```
+
+##### Install CMake
+
+- macOS
+
+On macOS, CMake can be installed with Homebrew:
+
+```
+brew install cmake
+
+```
+- Linux
+
+On Linux, it can be installed with apt-get:
+```
+apt-get install cmake
+
+```
+
+##### Tips:
+To get a smaller library, you can build one that only supports the model structures you need.
+For example, running
+
+```
+sh build.sh android googlenet
+```
+produces a smaller library that supports only GoogLeNet.
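Taken end to end, the macOS/Linux build described above is only a handful of commands. A minimal sketch for a Linux host, assuming the NDK is unpacked next to the paddle-mobile checkout (the unpack path and the model list are illustrative):

```
# Fetch and unpack the NDK (use the darwin archive on macOS).
wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
unzip android-ndk-r17b-linux-x86_64.zip

# The bundled standalone toolchain finds the NDK through NDK_ROOT (no spaces around '=').
export NDK_ROOT="$PWD/android-ndk-r17b"

# Cross-compile; naming models keeps only the operators those models need.
cd paddle-mobile/tools
sh build.sh android mobilenet googlenet

# The shared library lands in build/release/, the unit-test binaries in test/build/.
ls ../build/release/ ../test/build/
```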
 ## Building inside a Docker container
 ### 1. Install Docker
@@ -67,115 +150,38 @@ root@5affd29d4fc5:/ # make
 ### 6. Inspect the build output
 The build output can be inspected on the host machine, under build and test/build in the paddle-mobile directory, and can be transferred to the device with adb or scp for execution.

-## Cross-compiling on Linux
-### Preparing the cross-compilation environment
-##### Download the Android NDK
+## Testing
+After the build finishes, an automated test script is provided to push the models and library files needed by the unit-test binaries to an Android device.

-To cross-compile paddle-mobile from source, the cross-compilation environment has to be prepared in advance. The C/C++ cross-compilation toolchain for the Android platform is the [Android NDK](https://developer.android.com/ndk/); you can download it yourself, or fetch it with the following commands:
-- macOS
-```
-wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
-unzip android-ndk-r17b-darwin-x86_64.zip
+* Download the [mobilenet model and test_image_1x3x224x224_float (a preprocessed NCHW file)](http://mms-graph.bj.bcebos.com/paddle-mobile/opencl_test_src.zip) needed for testing.

-```
-- Linux
-```
-wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
-unzip android-ndk-r17b-linux-x86_64.zip
-```
-
-##### Set the environment variable
-The standalone toolchain bundled with the project locates the NDK through the NDK_ROOT environment variable, so it must be configured:
-
-```
-export NDK_ROOT = "path to ndk"
-```
-### Run the build
-Run the following commands from the paddle-mobile root directory:
+* Create the model and image folders:

 ```
-cd tools
-sh build.sh android
-
+cd test
+mkdir models
+mkdir images
 ```
-After the build finishes, the generated .so is placed in the build directory and the unit-test executables in the test/build directory.
-##### Tips:
-To get a smaller library, you can build one that only supports the model structures you need.
-For example, running:
-```
-sh build.sh android googlenet
-```
-produces a smaller library that supports only GoogLeNet.
+* Copy mobilenet into paddle-mobile/test/models and test_image_1x3x224x224_float into paddle-mobile/test/images.

-## Testing
-After the build finishes, an automated test script is provided to push the models and library files needed by the unit-test binaries to an Android device. Run the following commands:
+* Run the following commands to deploy the executables and the files needed for inference to the phone:

 ```
 cd tools/android-debug-script
-sh run_on_android.sh (npm)   The optional argument npm selects whether model files are pushed to the phone.
+sh push2android.sh
 ```
-The following prompt appears:
-```
-**** choose OP or NET to test ****
-which to test :
-```
-Enter a name to run the corresponding test binary.
+* mobilenet CPU inference

-## Deployment
-Android applications can call the underlying C/C++ code through JNI. paddle-mobile exposes the following JNI interfaces:
+Assuming mobilenet and test_image_1x3x224x224_float have been pushed to the phone, run the following commands to run mobilenet inference on the CPU:

-##### 1. The load interface: loading model parameters
-- Used for models whose parameter files are stored separately
 ```
-/**
- * Load separated parameters
- * @param modelDir
- * @return
- */
- public static native boolean load(String modelDir);
+adb shell
+cd /data/local/tmp/bin/
+export LD_LIBRARY_PATH=.
+./test-mobilenet
 ```
-- Used for models whose parameter files are combined into one file
-```
-/**
- * Load combined parameters
- * @param modelPath
- * @param paramPath
- * @return
- */
- public static native boolean loadCombined(String modelPath, String paramPath);
-```
-##### 2. The predict interface: running inference
-- predict interface that accepts a preprocessed RGB array
-```
-/**
- * @param buf input data
- * @return output data
- */
-JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
-    JNIEnv *env, jclass thiz, jfloatArray buf);
-```
-- predict interface that accepts raw YUV data
-```
- /**
-  *
-  * @param buf byte array in yuv420 format
-  * @param imgWidth width of the yuv data
-  * @param imgHeight height of the yuv data
-  * @param ddims shape of the input data
-  * @param meanValues per-channel means used when the model was trained
-  * @return
-  */
-
-  public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[] meanValues);
-
-```
-##### 3. The clear interface: destroying the instance and freeing memory
-
-```
-JNIEXPORT void JNICALL Java_com_baidu_paddle_PMLL_clear(JNIEnv *env,
-                                                        jclass thiz);
-```
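The test steps above condense into a similar shell sketch; the source paths for the unpacked mobilenet model and test image are placeholders, while the on-device path /data/local/tmp/bin is the one shown above:

```
# Prepare the model and image folders expected by the unit tests.
cd paddle-mobile/test
mkdir -p models images
cp -r /path/to/opencl_test_src/mobilenet models/
cp /path/to/opencl_test_src/test_image_1x3x224x224_float images/

# Push the test binaries, libraries, models and images to the device.
cd ../tools/android-debug-script
sh push2android.sh

# Run the mobilenet CPU unit test on the device.
adb shell "cd /data/local/tmp/bin && export LD_LIBRARY_PATH=. && ./test-mobilenet"
```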
diff --git a/src/common/types.cpp b/src/common/types.cpp
index 8f284b3fe1115bd8cec78430a405289aae98e898..ba00f639d76ae7c928f5b7484c08cec0b0926904 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -71,10 +71,10 @@ const char *G_OP_TYPE_SUM = "sum";
 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";

-extern const char *G_OP_TYPE_TANH = "tanh";
-extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
-extern const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
-extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";
+const char *G_OP_TYPE_TANH = "tanh";
+const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
+const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
+const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";

 std::unordered_map<
     std::string, std::pair, std::vector>>
diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 04e51ab9b09fabc41fcd1cd73864bc285d183821..7c1f15f7c90e0b1ebc15a9ec8f3f6333ff173978 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -13,251 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "fpga/V1/api.h"
-#include
-#include
-#include
-#include
-#include
 #include "fpga/V1/bias_scale.h"
 #include "fpga/V1/filter.h"
 #include "fpga/V1/image.h"
-#define FPGA_TEST_MODE
-#define PADDLE_MOBILE_OS_LINUX

 namespace paddle_mobile {
 namespace fpga {

-static int fd = -1;
-static const char *device_path = "/dev/fpgadrv0";
-static std::map memory_map;
-
-static inline int do_ioctl(int req, const void *arg) {
-#ifdef PADDLE_MOBILE_OS_LINUX
-  int result = ioctl(fd, req, (uint64_t)arg);
-  PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
-  return result;
-#else
-  return -1;
-#endif
-}
-
-int open_device() {
-  if (fd == -1) {
-    fd = open(device_path, O_RDWR);
-  }
-  return fd;
-}
-
-// memory management;
-void *fpga_malloc(size_t size) {
-  static uint64_t counter = 0;
-
-#ifdef PADDLE_MOBILE_OS_LINUX
-  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-#else
-  auto ptr = malloc(size);
-#endif
-  counter += size;
-  memory_map.insert(std::make_pair(ptr, size));
-  // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. 
Total " - // << counter << " bytes"; - return ptr; -} - -void fpga_free(void *ptr) { - static uint64_t counter = 0; - size_t size = 0; - - auto iter = memory_map.find(ptr); // std::map::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); -#ifdef PADDLE_MOBILE_OS_LINUX - munmap(ptr, size); -#else - free(ptr); -#endif - counter += size; - // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " - // << counter << " bytes"; - } else { - DLOG << "Invalid pointer"; - } -} - -void fpga_copy(void *dest, const void *src, size_t num) { - memcpy(dest, src, num); -} - -int fpga_flush(void *address, size_t size) { - struct MemoryCacheArgs args = {nullptr}; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -int fpga_invalidate(void *address, size_t size) { - struct MemoryCacheArgs args = {nullptr}; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -half fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10)); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(half fp16_num) { - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -int ComputeBasicConv(const struct ConvArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "======Compute Basic Conv======"; - DLOG << " relu_enabled:" << args.relu_enabled - << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int ComputeFpgaConv(const struct SplitConvArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFPGAConv==========="; - DLOG << " filter_num:" << args.filter_num - << " group_num:" << args.group_num - << " split_num:" << args.split_num; -#endif - - int split_num = args.split_num; - for (int i = 0; i < split_num; i++) { - ComputeBasicConv(args.conv_args[i]); - } - - if (split_num > 1) { - ComputeFPGAConcat(args.concat_arg); - } -} - -int ComputeFpgaPool(const struct PoolingArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaPool==========="; - DLOG << " mode:" << args.mode - << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << 
args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_POOLING, &args); -} - -int ComputeFpgaEWAdd(const struct EWAddArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaEWAdd==========="; - DLOG << " relu_enabled:" << args.relu_enabled - << " const0:" << fp16_2_fp32(int16_t(args.const0)) - << " const1:" << fp16_2_fp32(int16_t(args.const1)); - DLOG << " image0_address:" << args.image0.address - << " image0_scale_address:" << args.image0.scale_address - << " image0_channels:" << args.image0.channels - << " image0_height:" << args.image0.height - << " image0_width:" << args.image0.width - << " pad0_height:" << args.image0.pad_height - << " pad0_width:" << args.image0.pad_width; - DLOG << " image1_address:" << args.image1.address - << " image1_scale_address:" << args.image1.scale_address - << " image1_channels:" << args.image1.channels - << " image1_height:" << args.image1.height - << " image1_width:" << args.image1.width - << " pad1_height:" << args.image1.pad_height - << " pad_width:" << args.image1.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_EW, &args); -} -int PerformBypass(const struct BypassArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaBypass==========="; - DLOG << " input_type:" << args.input_data_type - << " output_type:" << args.output_data_type - << " input_layout_type:" << args.input_layout_type - << " output_layout_type:" << args.output_layout_type; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_BYPASS, &args); -} - -int ComputeFPGAConcat(const struct ConcatArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaConcat==========="; - DLOG << " Image_num: " << args.image_num - << " out_address:" << args.image_out - << " out_scale_address:" << args.scale_out; - DLOG << " image_height:" << args.height << " image_width:" << args.width; - for (int i = 0; i < args.image_num; i++) { - DLOG << " " << i << "th: "; - DLOG << " channel_num:" << args.channel_num[i] - << " image_address:" << args.images_in[i] - << " image_scale_address:" << args.scales_in[i]; - } -#endif - - image::concat_images(args.images_in, args.scales_in, args.image_out, - args.scale_out, args.image_num, args.channel_num, - args.height, args.width); - return 0; -} - int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } void format_image(framework::Tensor *image_tensor) { @@ -397,7 +159,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; arg->output.scale_address = out->scale; - arg->conv_args = + arg->conv_arg = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT arg->concat_arg.image_num = 
arg->split_num; @@ -420,44 +182,44 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); for (int i = 0; i < n; i++) { - arg->conv_args[i].relu_enabled = relu_enabled; - arg->conv_args[i].group_num = (uint32_t)group_num; - arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h; - arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w; - arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; - arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; - arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].image.channels = (uint32_t)input->dims()[1]; - arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; - arg->conv_args[i].image.scale_address = input->scale; - arg->conv_args[i].image.pad_height = (uint32_t)padding_h; - arg->conv_args[i].image.pad_width = (uint32_t)padding_w; - arg->conv_args[i].filter_scale_address = filter->scale; - arg->conv_args[i].filter_address = &( + arg->conv_arg[i].relu_enabled = relu_enabled; + arg->conv_arg[i].group_num = (uint32_t)group_num; + arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; + arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; + arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; + arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3]; + arg->conv_arg[i].image.address = input_ptr; + arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; + arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_arg[i].image.scale_address = input->scale; + arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; + arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; + arg->conv_arg[i].filter_scale_address = filter->scale; + arg->conv_arg[i].filter_address = &( (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_args[i].filter_num = (uint32_t)( + arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].filter_num = (uint32_t)( i == n - 1 ? 
channel - (n - 1) * filter_num_per_div // NOLINT : filter_num_per_div); if (n > 1) { - arg->conv_args[i].output.scale_address = + arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT - arg->conv_args[i].output.address = fpga_malloc( - input->dims()[2] * - align_to_x(input->dims()[3] * arg->conv_args[i].filter_num, - IMAGE_ALIGNMENT) * - sizeof(half)); + arg->conv_arg[i].output.address = + fpga_malloc(input->dims()[2] * + align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num, + IMAGE_ALIGNMENT) * + sizeof(half)); } else { - arg->conv_args[i].output.scale_address = out->scale; - arg->conv_args[i].output.address = out_ptr; + arg->conv_arg[i].output.scale_address = out->scale; + arg->conv_arg[i].output.address = out_ptr; } arg->concat_arg.images_in[i] = - (half *)arg->conv_args[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address; - arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; + (half *)arg->conv_arg[i].output.address; // NOLINT + arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; + arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; } } diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index f535975a35ecc3c454bbac597b31d8c3670cbf91..daa7902ab4a6cb72a77bba31f8cfe84c897f30a4 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -14,178 +14,13 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include +#include "fpga/common/fpga_common.h" +#include "fpga/common/pe.h" #include "framework/tensor.h" namespace paddle_mobile { namespace fpga { -enum DataType { - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum LayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -struct VersionArgs { - void* buffer; -}; - -struct MemoryCopyArgs { - void* src; - void* dest; - size_t size; -}; - -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - float* scale_address; // input scale address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; -}; - -struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias are interlaced; - void* filter_address; - float* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct ConcatArgs { - uint32_t image_num; - half** images_in; - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t height; - uint32_t width; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -struct GroupConvArgs { - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct SplitConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -struct PoolingArgs { - int16_t mode; // mode: 0:max, 1:avg - half kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct EWAddArgs { - bool relu_enabled; - - uint32_t const0; // output0 = const0 x input0 + const1 x 
input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; -}; - -struct BypassArgs { - enum DataType input_data_type; - enum DataType output_data_type; - enum LayoutType input_layout_type; - enum LayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct FpgaRegWriteArgs { - uint64_t address; // - uint64_t value; -}; - -struct FpgaRegReadArgs { - uint64_t address; - uint64_t value; -}; - -struct MemoryCacheArgs { - void* address; - size_t size; -}; - -#define IOCTL_FPGA_MAGIC 'FPGA' - -#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) - -#define IOCTL_SEPARATOR_0 10 - -#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) -#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) -#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) - -#define IOCTL_SEPARATOR_1 20 - -#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) -#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) -#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) -#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) -#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) -#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) - -//============================== API ============================= - -int open_device(); -int close_device(); - -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dst, const void* src, size_t num); -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - -int PerformBypass(const struct BypassArgs& args); -int ComputeFpgaConv(const struct SplitConvArgs& args); -int ComputeFpgaPool(const struct PoolingArgs& args); -int ComputeFpgaEWAdd(const struct EWAddArgs& args); -int ComputeFPGAConcat(const struct ConcatArgs& args); - -static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } - int get_align_image_cw(int cw); void format_image(framework::Tensor* image_tensor); void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory @@ -209,8 +44,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, bool relu_enabled, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float* bs_ptr); -half fp32_2_fp16(float fp32_num); -float fp16_2_fp32(half fp16_num); - } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/bias_scale.cpp b/src/fpga/V1/bias_scale.cpp index 3c2c04dc1d7f76953b04a879fbcfa8377dd7ba8a..263a7494c5602c13208aa0d8899ce80d781aa11b 100644 --- a/src/fpga/V1/bias_scale.cpp +++ b/src/fpga/V1/bias_scale.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "fpga/V1/bias_scale.h" #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp index 3f4a3e2c876f0b54546f0e385d4a5e8bbfacdf3c..157ac90a60262cadacb648173cbc5ba6c01e674e 100644 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "fpga/V1/filter.h" #include #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -31,20 +31,22 @@ int calc_split_num(int num, int division_capacity) { } int calc_division_number(int num, int group_num, int division_capacity) { - PADDLE_MOBILE_ENFORCE(num % group_num == 0, - "Filter number should be divisible by group number"); + // PADDLE_MOBILE_ENFORCE(num % group_num == 0, + // "Filter number should be divisible by group + // number"); int split_num = calc_split_num(num, division_capacity); - PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - "Split number or group number should be 1"); + // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + // "Split number or group number should be 1"); return group_num * split_num; } int calc_num_per_div(int num, int group_num, int division_capacity) { - PADDLE_MOBILE_ENFORCE(num % group_num == 0, - "Filter number should be divisible by group number"); + // PADDLE_MOBILE_ENFORCE(num % group_num == 0, + // "Filter number should be divisible by group + // number"); int split_num = calc_split_num(num, division_capacity); - PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - "Split number or group number should be 1"); + // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + // "Split number or group number should be 1"); if (group_num == 1) { if (num > division_capacity) { return division_capacity; diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index 73be05c942d6a848db830148d25bc8b3e14b53e4..f4142ad58a273691c84db9dd585518e7edcff8a6 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V1/image.h" #include #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -111,6 +111,27 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); } +void split_image(int16_t *image_in, float *scale_in, void **images_out, + float **scales_out, int image_num, uint32_t *channel_nums, + int height, int width) { + int total_channel = 0; + for (int i = 0; i < image_num; i++) { + scales_out[i][0] = scale_in[0]; + scales_out[i][1] = scale_in[1]; + total_channel += channel_nums[i]; + } + + for (int h = 0; h < height; h++) { + int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT); + for (int i = 0; i < image_num; i++) { + int des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT); + memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, + channel_nums[i] * sizeof(int16_t)); + src_offset += channel_nums[i]; + } + } +} + } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/image.h b/src/fpga/V1/image.h index 7e004916118ae97d60d24e798300d66a98191211..321967bbe233c5bec889aeb63f98dc23779b4918 100644 --- a/src/fpga/V1/image.h +++ b/src/fpga/V1/image.h @@ -28,6 +28,9 @@ void concat_images(int16_t** images_in, float** scales_in, void* image_out, float* scale_out, int image_num, uint32_t* channel_num, int height, int width); // Concat featuremaps along channel direction +void split_image(int16_t* image_in, float* scale_in, void** images_out, + float** scales_out, int image_num, uint32_t* channel_nums, + int height, int width); } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp new file 
mode 100644 index 0000000000000000000000000000000000000000..1f0e5768a7017a4c7f928fea86b8f4ef3cdbae3d --- /dev/null +++ b/src/fpga/V1/pe.cpp @@ -0,0 +1,178 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/common/pe.h" +#include "fpga/V1/filter.h" +#include "fpga/V1/image.h" +#include "fpga/common/config.h" +#include "fpga/common/driver.h" + +namespace paddle_mobile { +namespace fpga { + +int ComputeFpgaConv(const struct SplitConvArgs &args) { + ComputeBasicConv(args.conv_arg[0]); +} + +int ComputeBasicConv(const struct ConvArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "======Compute Basic Conv======"; + DLOG << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFpgaPool(const struct PoolingArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaPool==========="; + DLOG << " mode:" << args.mode + << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int ComputeFpgaEWAdd(const struct EWAddArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaEWAdd==========="; + DLOG << " relu_enabled:" << args.relu_enabled + << " const0:" << fp16_2_fp32(int16_t(args.const0)) + << " const1:" << fp16_2_fp32(int16_t(args.const1)); + DLOG << " image0_address:" << args.image0.address + << " image0_scale_address:" << args.image0.scale_address + << " image0_channels:" << args.image0.channels + << " image0_height:" << args.image0.height + << " image0_width:" << args.image0.width + << " pad0_height:" 
<< args.image0.pad_height + << " pad0_width:" << args.image0.pad_width; + DLOG << " image1_address:" << args.image1.address + << " image1_scale_address:" << args.image1.scale_address + << " image1_channels:" << args.image1.channels + << " image1_height:" << args.image1.height + << " image1_width:" << args.image1.width + << " pad1_height:" << args.image1.pad_height + << " pad_width:" << args.image1.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int PerformBypass(const struct BypassArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaBypass==========="; + DLOG << " input_type:" << args.input_data_type + << " output_type:" << args.output_data_type + << " input_layout_type:" << args.input_layout_type + << " output_layout_type:" << args.output_layout_type; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFPGAConcat(const struct ConcatArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaConcat==========="; + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out + << " out_scale_address:" << args.scale_out; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.channel_num[i] + << " image_address:" << args.images_in[i] + << " image_scale_address:" << args.scales_in[i]; + } +#endif + + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width); + return 0; +} + +int ComputeFPGASplit(const struct SplitArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaSplit==========="; + DLOG << " Image_num: " << args.image_num + << " in_address:" << args.image_in + << " in_scale_address:" << args.scale_in; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.out_channel_nums[i] + << " image_address:" << args.images_out[i] + << " image_scale_address:" << args.scales_out[i]; + } +#endif + image::split_image(args.image_in, args.scale_in, args.images_out, + args.scales_out, args.image_num, args.out_channel_nums, + args.height, args.width); + return 0; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index 2f8a9f119e643b3836ef2c541e098f39ab3cbd17..5bfd34104600668ce63a9c7d684d4482d5d804fb 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -13,84 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "fpga/V2/api.h" -#include #include "fpga/V2/bias_scale.h" -#include "fpga/V2/config.h" -#include "fpga/V2/driver/driver.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" namespace paddle_mobile { namespace fpga { -static std::map memory_map; - -int open_device() { - int ret = driver::open_device_driver(); - return ret; -} - -int close_device() { - int ret = driver::close_device_driver(); - return ret; -} - -void *fpga_malloc(size_t size) { - static uint64_t counter = 0; -#ifdef PADDLE_MOBILE_ZU5 - auto ptr = driver::fpga_malloc_driver(size); -#else - auto ptr = malloc(size); -#endif - counter += size; - memory_map.insert(std::make_pair(ptr, size)); - // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " - // << counter << " bytes"; - return ptr; -} - -void fpga_free(void *ptr) { - static uint64_t counter = 0; - size_t size = 0; - auto iter = memory_map.find(ptr); // std::map::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); -#ifdef PADDLE_MOBILE_ZU5 - driver::fpga_free_driver(ptr); -#else - free(ptr); -#endif - counter += size; - // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " - // << counter << " bytes"; - } else { - DLOG << "Invalid pointer"; - } -} -void fpga_copy(void *dest, const void *src, size_t num) { -#ifdef PADDLE_MOBILE_ZU5 - driver::fpga_copy_driver(dest, src, num); -#else - memcpy(dest, src, num); -#endif -} - -int fpga_flush(void *address, size_t size) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::fpga_flush_driver(address, size); -#else - return 0; -#endif -} -int fpga_invalidate(void *address, size_t size) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::fpga_invalidate_driver(address, size); -#else - return 0; -#endif -} - void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; @@ -284,8 +213,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].output.address = out_ptr; arg->conv_arg[i].output.scale_address = out->scale; - int num_after_alignment = - filter::calc_aligned_num((int)input->dims()[1], arg->filter_num); + int num_after_alignment = filter::calc_aligned_num( + (int)input->dims()[1], arg->filter_num); // NOLINT arg->conv_arg[i].free_space = fpga_malloc(num_after_alignment * 2 * sizeof(half)); } diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h index 1f4a203936b517d93e2d417b08a8b8456cc1fc93..1386810164d72ef849162b76a8b83fcf32082907 100644 --- a/src/fpga/V2/api.h +++ b/src/fpga/V2/api.h @@ -14,21 +14,13 @@ limitations under the License. 
*/ #pragma once -#include "fpga/V2/driver/pe.h" -#include "fpga/V2/fpga_common.h" +#include "fpga/common/fpga_common.h" +#include "fpga/common/pe.h" #include "framework/tensor.h" namespace paddle_mobile { namespace fpga { -int open_device(); -int close_device(); -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dest, const void* src, size_t num); -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - float filter_find_max(framework::Tensor* filter_tensor); int get_aligned_channel_num(int channel_num); int get_aligned_filter_num(framework::Tensor* filter_tensor); diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp index 3afd3f51bbb10e3bb2d66195fcc54d25c56e2393..c8f587da330c6e6e9e35969d58ae27f4366830d2 100644 --- a/src/fpga/V2/bias_scale.cpp +++ b/src/fpga/V2/bias_scale.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "fpga/V2/bias_scale.h" #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp index ce278edbeed64f2ca413c1f75ff620ee1f44c83d..b17ce4406bf1b6b4619d0e9e75d3f432dfa84fb1 100644 --- a/src/fpga/V2/filter.cpp +++ b/src/fpga/V2/filter.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V2/filter.h" #include #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -73,7 +73,7 @@ void convert_to_hwc(float **data_in, int num, int channel, int height, void align_filter(float **data_in, int num, int channel, int height, int width) { - int aligned_channel = calc_channel_parallelism(channel); + int aligned_channel = calc_aligned_channel(channel); int hw = height * width; int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp deleted file mode 100644 index 01bca30a9ccf79232e1f28bbf77b1c030632f5bc..0000000000000000000000000000000000000000 --- a/src/fpga/V2/fpga_common.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -namespace paddle_mobile { -namespace fpga { - -int16_t fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10))); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(int16_t fp16_num) { - if (0 == fp16_num) { - return 0; - } - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -} // namespace fpga -} // namespace paddle_mobile diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp index 26829bfba65f2375b27251070b33b2bbe57d069b..3d1ed95df2a805c8c64f9184e0a720f5449d6181 100644 --- a/src/fpga/V2/image.cpp +++ b/src/fpga/V2/image.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V2/image.h" #include #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/pe.cpp similarity index 79% rename from src/fpga/V2/driver/pe.cpp rename to src/fpga/V2/pe.cpp index 2e806bfb37c131fad1c011c960bc79aa1b121186..35ef06de2d7b3adf5d8d71722319f66026635eaa 100644 --- a/src/fpga/V2/driver/pe.cpp +++ b/src/fpga/V2/pe.cpp @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/V2/driver/pe.h" -#include "fpga/V2/config.h" -#include "fpga/V2/driver/driver.h" +#include "fpga/common/pe.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" +#include "fpga/common/config.h" +#include "fpga/common/driver.h" namespace paddle_mobile { namespace fpga { @@ -166,54 +166,54 @@ int PerformBypass(const struct BypassArgs &args) { return 0; #endif - uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); - uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); - uint64_t bp_enable; - int64_t length; - uint64_t pixels; - - // fp32->fp16 - if ((args.input_data_type) && (!args.output_data_type)) { - pixels = (args.image.channels) * (args.image.width) * (args.image.height); - length = pixels * sizeof(float); - bp_enable = 0x8800000000000000 + length; - } - // fp16->fp32 - else if ((!args.input_data_type) && (args.output_data_type)) { - pixels = filter::calc_aligned_channel((args.image.channels)) * - (args.image.width) * (args.image.height); - length = pixels * sizeof(short); - length = align_to_x((int)length, 64); // NOLINT - bp_enable = 0x8a00000000000000 + length; - } - // fp16->fp16 findmax - else if ((!args.input_data_type) && (!args.output_data_type)) { - pixels = (args.image.channels) * (args.image.width) * (args.image.height); - length = pixels * sizeof(short); - bp_enable = 0x8900000000000000 + length; - } else { - return -1; - } - - // start bypass - driver::reg_writeq(ifm_src_paddr, MUL8(27)); - driver::reg_writeq(ifm_dst_paddr, MUL8(28)); - driver::reg_writeq(0, MUL8(0)); - driver::reg_writeq(bp_enable, MUL8(0)); - // poll - int ret = -1; - ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); - if (ret != -1) { - // clear "irq" - driver::reg_readq(MUL8(63)); - } - // get max value - if ((!args.input_data_type) && (!args.output_data_type)) { - float scale = Findfp16Max(); - args.output.scale_address[0] = 
(float)(1.0 / scale); // NOLINT - args.output.scale_address[1] = scale; - } - return ret; + // uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); + // uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); + // uint64_t bp_enable; + // int64_t length; + // uint64_t pixels; + // + // // fp32->fp16 + // if ((args.input_data_type) && (!args.output_data_type)) { + // pixels = (args.image.channels) * (args.image.width) * + // (args.image.height); length = pixels * sizeof(float); bp_enable = + // 0x8800000000000000 + length; + // } + // // fp16->fp32 + // else if ((!args.input_data_type) && (args.output_data_type)) { + // pixels = filter::calc_aligned_channel((args.image.channels)) * + // (args.image.width) * (args.image.height); + // length = pixels * sizeof(short); + // length = align_to_x((int)length, 64); // NOLINT + // bp_enable = 0x8a00000000000000 + length; + // } + // // fp16->fp16 findmax + // else if ((!args.input_data_type) && (!args.output_data_type)) { + // pixels = (args.image.channels) * (args.image.width) * + // (args.image.height); length = pixels * sizeof(short); bp_enable = + // 0x8900000000000000 + length; + // } else { + // return -1; + // } + // + // // start bypass + // driver::reg_writeq(ifm_src_paddr, MUL8(27)); + // driver::reg_writeq(ifm_dst_paddr, MUL8(28)); + // driver::reg_writeq(0, MUL8(0)); + // driver::reg_writeq(bp_enable, MUL8(0)); + // // poll + // int ret = -1; + // ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); + // if (ret != -1) { + // // clear "irq" + // driver::reg_readq(MUL8(63)); + // } + // // get max value + // if ((!args.input_data_type) && (!args.output_data_type)) { + // float scale = Findfp16Max(); + // args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT + // args.output.scale_address[1] = scale; + // } + // return ret; } int ComputeFPGAConcat(const struct ConcatArgs &args) { diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/common/bitmap.cpp similarity index 99% rename from src/fpga/V2/driver/bitmap.cpp rename to src/fpga/common/bitmap.cpp index c612faa6aed11b683ff81fffdf6c57a6fed9536d..9742a4559927b0520b32eeabc757f5a0f4e3392a 100644 --- a/src/fpga/V2/driver/bitmap.cpp +++ b/src/fpga/common/bitmap.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/V2/driver/bitmap.h" +#include "fpga/common/bitmap.h" namespace fpga_bitmap { void bitmap_set(uint64_t *map, unsigned int start, int len) { diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/common/bitmap.h similarity index 100% rename from src/fpga/V2/driver/bitmap.h rename to src/fpga/common/bitmap.h diff --git a/src/fpga/V2/config.h b/src/fpga/common/config.h similarity index 100% rename from src/fpga/V2/config.h rename to src/fpga/common/config.h diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/common/driver.cpp similarity index 96% rename from src/fpga/V2/driver/driver.cpp rename to src/fpga/common/driver.cpp index d7e71782676fd350f938847c03e9736ff0adb64a..8c59ac14fb11282b29a837152194d873bd65d87d 100644 --- a/src/fpga/V2/driver/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -28,8 +28,8 @@ limitations under the License. 
*/ #include #include "common/enforce.h" -#include "fpga/V2/driver/bitmap.h" -#include "fpga/V2/driver/driver.h" +#include "fpga/common/bitmap.h" +#include "fpga/common/driver.h" namespace paddle_mobile { namespace fpga { @@ -353,7 +353,7 @@ void fpga_free_driver(void *ptr) { } } -static inline int do_ioctl(unsigned long req, const void *arg) { +static inline int do_ioctl(int64_t req, const void *arg) { return ioctl(g_fpgainfo.fd_mem, req, arg); } @@ -363,7 +363,7 @@ int fpga_flush_driver(void *address, size_t size) { p_addr = vaddr_to_paddr(address); - args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.size = size; return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); @@ -375,7 +375,7 @@ int fpga_invalidate_driver(void *address, size_t size) { p_addr = vaddr_to_paddr(address); - args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.size = size; return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); @@ -389,7 +389,7 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) { for (i = 0; i < num; i++) { // DLOG << "i:" << i << " val:" << *((int8_t *)src + i); // usleep(1); - *((int8_t *)dest + i) = *((int8_t *)src + i); + *((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT } return; diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/common/driver.h similarity index 90% rename from src/fpga/V2/driver/driver.h rename to src/fpga/common/driver.h index 633e95ea8204ada2a330a6bb4fab4ce8fe23248b..2dad07ec5206a7ca64449aa38ebe0603d72b71e3 100644 --- a/src/fpga/V2/driver/driver.h +++ b/src/fpga/common/driver.h @@ -33,8 +33,6 @@ namespace driver { #define FPGA_MEM_PHY_ADDR 0x20000000 #define FPGA_MEM_SIZE 0x20000000 -#define CPU_FREQ 1000000000 - #define FPGA_PAGE_SIZE (16UL * 1024UL) // PE related macros @@ -53,7 +51,7 @@ struct MemoryCacheArgs { size_t size; }; -#define IOCTL_FPGA_MAGIC 'FPGA' +#define IOCTL_FPGA_MAGIC 'F' #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) @@ -105,17 +103,17 @@ extern struct FPGA_INFO g_fpgainfo; inline uint64_t reg_readq(uint32_t offset) { // DLOG << "offset : " << offset; - uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + - offset); // NOLINT + uint64_t value = + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT + offset); // NOLINT return value; } inline void reg_writeq(uint64_t value, uint32_t offset) { // DLOG << "offset : " << offset << ", value : " << value; - *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + - offset) = // NOLINT - value; + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT + offset) = value; } int open_device_driver(); diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c7ae838fa4216d121cf38a11ef4897043b9a0dd --- /dev/null +++ b/src/fpga/common/fpga_common.cpp @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/common/fpga_common.h" +#include +#include +#include "fpga/common/config.h" +#include "fpga/common/driver.h" + +namespace paddle_mobile { +namespace fpga { + +int16_t fp32_2_fp16(float fp32_num) { + unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT + auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | + (((tmp & 0x7f800000) >> 13) - (112 << 10))); + if (tmp & 0x1000) { + t++; // roundoff + } + return t; +} + +float fp16_2_fp32(int16_t fp16_num) { + if (0 == fp16_num) { + return 0; + } + int frac = (fp16_num & 0x3ff); + int exp = ((fp16_num & 0x7c00) >> 10) + 112; + int s = fp16_num & 0x8000; + int tmp = 0; + float fp32_num; + tmp = s << 16 | exp << 23 | frac << 13; + fp32_num = *(float *)&tmp; // NOLINT + return fp32_num; +} + +static std::map memory_map; + +int open_device() { + int ret = driver::open_device_driver(); + return ret; +} + +int close_device() { + int ret = driver::close_device_driver(); + return ret; +} + +void *fpga_malloc(size_t size) { + static uint64_t counter = 0; +#ifdef PADDLE_MOBILE_ZU5 + auto ptr = driver::fpga_malloc_driver(size); +#else + auto ptr = malloc(size); +#endif + counter += size; + memory_map.insert(std::make_pair(ptr, size)); + // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " + // << counter << " bytes"; + return ptr; +} + +void fpga_free(void *ptr) { + static uint64_t counter = 0; + size_t size = 0; + auto iter = memory_map.find(ptr); // std::map::iterator + if (iter != memory_map.end()) { + size = iter->second; + memory_map.erase(iter); +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_free_driver(ptr); +#else + free(ptr); +#endif + counter += size; + // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " + // << counter << " bytes"; + } else { + DLOG << "Invalid pointer"; + } +} +void fpga_copy(void *dest, const void *src, size_t num) { +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_copy_driver(dest, src, num); +#else + memcpy(dest, src, num); +#endif +} + +int fpga_flush(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_flush_driver(address, size); +#else + return 0; +#endif +} +int fpga_invalidate(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_invalidate_driver(address, size); +#else + return 0; +#endif +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/common/fpga_common.h similarity index 84% rename from src/fpga/V2/fpga_common.h rename to src/fpga/common/fpga_common.h index 1862d843503ee8faf58caf038202e198ca079905..b3f619f2f24aba47d99f7f427c4b67af8c0d430d 100644 --- a/src/fpga/V2/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include namespace paddle_mobile { @@ -73,8 +74,19 @@ struct ConcatArgs { void* image_out; float* scale_out; uint32_t* channel_num; - uint32_t* aligned_channel_num; - uint32_t out_channel; + // uint32_t* aligned_channel_num; + // uint32_t out_channel; + uint32_t height; + uint32_t width; +}; + +struct SplitArgs { + uint32_t image_num; + int16_t* image_in; + float* scale_in; + void** images_out; + float** scales_out; + uint32_t* out_channel_nums; uint32_t height; uint32_t width; }; @@ -117,9 +129,19 @@ struct BypassArgs { struct DeconvArgs { struct ConvArgs conv_arg; }; + static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } + int16_t fp32_2_fp16(float fp32_num); float fp16_2_fp32(int16_t fp16_num); +int open_device(); +int close_device(); +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dest, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/common/pe.h similarity index 91% rename from src/fpga/V2/driver/pe.h rename to src/fpga/common/pe.h index 4903bf4c33f6b5d5899c56eeaada8c7a21d1a875..ae773f25b4171df3e552aaa07bb05af8564d872a 100644 --- a/src/fpga/V2/driver/pe.h +++ b/src/fpga/common/pe.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "fpga/V2/fpga_common.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -25,6 +25,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs& args); int ComputeFpgaConv(const struct SplitConvArgs& args); int ComputeFPGAConcat(const struct ConcatArgs& args); +int ComputeFPGASplit(const struct SplitArgs& args); } // namespace fpga } // namespace paddle_mobile diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h index bea91ee24ceb5e9011708bd277629a07beb4b8ef..8a79d9bad766ba3ef4d3fc1eeae7813ecd60887b 100644 --- a/src/framework/cl/cl_helper.h +++ b/src/framework/cl/cl_helper.h @@ -61,9 +61,16 @@ class CLHelper { auto work_size_2 = n * h; return {work_size_0, work_size_1, work_size_2}; } else if (image_dim.size() == 2) { + auto h = image_dim[0]; + auto w = image_dim[1]; return {1, image.ImageWidth(), image.ImageHeight()}; } else if (image_dim.size() == 1) { return {1, image.ImageWidth(), 1}; + } else if (image_dim.size() == 3) { + int c = image_dim[0]; + int h = image_dim[1]; + int w = image_dim[2]; + return {(c + 3) / 4, w, h}; } PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp "); } diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp index f32de0a61461d9a9b28d4a0cf5e13ecc9d564cf5..d6cc52d69cef1249fe9ed62a344d069af0305bc0 100644 --- a/src/framework/cl/cl_image.cpp +++ b/src/framework/cl/cl_image.cpp @@ -13,18 +13,98 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "framework/cl/cl_image.h" +#include "framework/cl/cl_tensor.h" namespace paddle_mobile { namespace framework { -void CLImageToTensor(CLImage *cl_image, Tensor *tensor, - cl_command_queue commandQueue) { - // TODO(yangfei): need imp +void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context, + cl_command_queue commandQueue, cl_kernel kernel) { + tensor->mutable_data(); + const auto &dim = cl_image->dims(); + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < dim.size(); ++j) { + new_dims[4 - dim.size() + j] = dim[j]; + } + size_t C, in_height, in_width; + + C = new_dims[1]; + in_height = new_dims[2]; + in_width = new_dims[3]; + + CLTensor out_cl_tensor(context, commandQueue); + out_cl_tensor.Resize(tensor->dims()); + cl_mem outBuffer = out_cl_tensor.mutable_data(); + + auto input_image = cl_image->GetCLImage(); + + clSetKernelArg(kernel, 0, sizeof(int), &in_height); + clSetKernelArg(kernel, 1, sizeof(int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer); + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + clSetKernelArg(kernel, 4, sizeof(int), &size_ch); + clSetKernelArg(kernel, 5, sizeof(int), &size_block); + clSetKernelArg(kernel, 6, sizeof(int), &size_batch); + clSetKernelArg(kernel, 7, sizeof(int), &C); + size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3], + new_dims[0] * new_dims[2]}; + clEnqueueNDRangeKernel(commandQueue, kernel, 3, NULL, global_work_size, NULL, + 0, NULL, NULL); + memcpy(tensor->data(), out_cl_tensor.Data(), + tensor->memory_size()); } -void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, - cl_command_queue commandQueue) { - // TODO(yangfei): need imp +void TensorToCLImage(Tensor *tensor, CLImage *cl_image, cl_context context, + cl_command_queue commandQueue, cl_kernel kernel) { + const auto &dim = cl_image->dims(); + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < dim.size(); ++j) { + new_dims[4 - dim.size() + j] = dim[j]; + } + cl_int status; + auto output = cl_image; + const Tensor *input = tensor; + const float *input_data = input->data(); + auto output_image = output->GetCLImage(); + const int out_C = new_dims[1]; + const int out_H = new_dims[2]; + const int out_W = new_dims[3]; + const int Stride2 = out_C * out_H * out_W; + const int Stride1 = out_H * out_W; + const int Stride0 = out_W; + DLOG << out_C; + DLOG << out_H; + DLOG << out_W; + CLTensor input_cl_tensor(context, commandQueue); + input_cl_tensor.Resize(input->dims()); + cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input_data); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); + CL_CHECK_ERRORS(status); + + size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3], + new_dims[0] * new_dims[2]}; + status = clEnqueueNDRangeKernel(commandQueue, kernel, 
3, NULL, + global_work_size, NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); } #ifdef PADDLE_MOBILE_DEBUG diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index 35f60d3b773937d381447b23b64985ce543fddee..f94eba187f2c5610d7a20098e95015244b420ce2 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -120,17 +120,19 @@ class CLImage { PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, " empty image tensor data shouldn't have value"); - CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + // CLImageConverterFolder *folder_converter = new + // CLImageConverterFolder(); + CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); DLOG << " to get image dims "; - image_dims_ = folder_converter->InitImageDimInfoWith(dim); + image_dims_ = normal_converter->InitImageDimInfoWith(dim); DLOG << " end get image dims " << image_dims_; InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); tensor_dims_ = dim; command_queue_ = command_queue; - image_converter_ = folder_converter; + image_converter_ = normal_converter; cl_event_ = CLEngine::Instance()->CreateEvent(context); initialized_ = true; DLOG << " end init cl image"; @@ -220,11 +222,11 @@ class CLImage { CLImageConverterBase *image_converter_ = nullptr; }; -void TensorToCLImage(Tensor *tensor, CLImage *image, - cl_command_queue commandQueue); +void TensorToCLImage(Tensor *tensor, CLImage *image, cl_context context, + cl_command_queue commandQueue, cl_kernel kernel); -void CLImageToTensor(CLImage *image, Tensor *tensor, - cl_command_queue commandQueue); +void CLImageToTensor(CLImage *image, Tensor *tensor, cl_context context, + cl_command_queue commandQueue, cl_kernel kernel); #ifdef PADDLE_MOBILE_DEBUG Print &operator<<(Print &printer, const CLImage &image); diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp index 13094a8d05ac6f7f8d2451a3498da058b37ee98b..1e63bd05671a5c47ee324554ae84a3f4f71e6020 100644 --- a/src/framework/cl/cl_image_converter.cpp +++ b/src/framework/cl/cl_image_converter.cpp @@ -389,5 +389,42 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, } } +const DDim &CLImageConverterNormal::InitImageDimInfoWith( + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return make_ddim({width, height}); +} + +void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, + "tensor dim is not support "); + + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); +} + +void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); +} + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h index 02887b0cd468a45630122bb3f236c0775ac1eaa1..ad5994f852f3b47c6a645b16cfd253a6529e117e 100644 --- 
a/src/framework/cl/cl_image_converter.h +++ b/src/framework/cl/cl_image_converter.h @@ -63,6 +63,31 @@ class CLImageConverterFolder : public CLImageConverterBase { int height_of_one_block_; }; +class CLImageConverterNormal : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + class CLImageConverterNWBlock : public CLImageConverterBase { const DDim &InitImageDimInfoWith(const DDim &tensor_dim); void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 6a773da00ff6541d55a6a9d04ca470fed5de81a1..4b50f15a868e3bdbb8434af0cc0d49a6cb54c6a5 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -143,10 +143,12 @@ double PaddleMobile::GetPredictTime() { int t1 = 1; int t2 = 1; for (int i = 0; i < m * k; ++i) { - a[i] = t1 + rand() % t2; + unsigned int seed = 100; + a[i] = t1 + rand_r(&seed) % t2; } for (int i = 0; i < k * n; ++i) { - b[i] = t1 + rand() % t2; + unsigned int seed = 200; + b[i] = t1 + rand_r(&seed) % t2; } paddle_mobile::operators::math::Gemm gemm; auto time1 = paddle_mobile::time(); @@ -215,13 +217,13 @@ double PaddleMobile::GetPredictTime() { cl_int status; cl_uint nPlatform; clGetPlatformIDs(0, NULL, &nPlatform); - cl_platform_id *listPlatform = - (cl_platform_id *)malloc(nPlatform * sizeof(cl_platform_id)); + cl_platform_id *listPlatform = reinterpret_cast( + malloc(nPlatform * sizeof(cl_platform_id))); clGetPlatformIDs(nPlatform, listPlatform, NULL); cl_uint nDevice = 0; clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice); cl_device_id *listDevice = - (cl_device_id *)malloc(nDevice * sizeof(cl_device_id)); + reinterpret_cast(malloc(nDevice * sizeof(cl_device_id))); clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice, NULL); cl_context context = @@ -277,41 +279,66 @@ double PaddleMobile::GetPredictTime() { clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); cl_kernel kernel = clCreateKernel(program, "feed", &status); + int out_H = 224; + int out_W = 224; + int out_C = 3; + int Stride2 = out_C * out_H * out_W; + int Stride1 = out_H * out_W; + int Stride0 = out_W; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &input_w); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &input_h); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); + 
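+  // GetPredictTime now drives the rewritten "feed" kernel, which takes
+  // (out_H, out_W, out_C, Stride0, Stride1, Stride2) instead of (w, h, c).
+  // Stride0 = W, Stride1 = H * W and Stride2 = C * H * W index the flat
+  // NCHW input buffer; with C = 3 there is a single channel block
+  // ((3 + 3) / 4 = 1), which is why global_work_size below is {1, 224, 224}.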
CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); CL_CHECK_ERRORS(status); - size_t global_work_size[2] = {input_w, input_h}; + size_t global_work_size[3] = {1, 224, 224}; // cl_event out_event = param.Out()->GetClEvent(); - status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, + status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); + out_H = 3; + out_W = 3; + out_C = 3; + Stride2 = out_C * out_H * out_W; + Stride1 = out_H * out_W; + Stride0 = out_W; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer); CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &filter_w); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &filter_h); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); CL_CHECK_ERRORS(status); - size_t global_work_size1[2] = {filter_w, filter_h}; + size_t global_work_size1[3] = {1, 3, 96}; // cl_event out_event = param.Out()->GetClEvent(); - status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size1, + status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size1, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); @@ -378,13 +405,16 @@ double PaddleMobile::GetPredictTime() { auto time2 = paddle_mobile::time(); paddle_mobile::memory::Free(input); paddle_mobile::memory::Free(filter); - return paddle_mobile::time_diff(time1, time2); + if (status == CL_SUCCESS) { + return paddle_mobile::time_diff(time1, time2); + } else { + return -1; + } } template int PaddleMobile::readText( const char *kernelPath, - char **pcode) // 读取文本文件放入 pcode,返回字符串长度 -{ + char **pcode) { // 读取文本文件放入 pcode,返回字符串长度 FILE *fp; int size; // printf(" File: %s\n", kernelPath); @@ -402,7 +432,7 @@ int PaddleMobile::readText( return -1; } rewind(fp); - if ((*pcode = (char *)malloc(size + 1)) == NULL) { + if ((*pcode = reinterpret_cast(malloc(size + 1))) == NULL) { printf(" Allocate space failed\n"); return -1; } diff --git a/src/operators/kernel/cl/box_coder_kernel.cpp b/src/operators/kernel/cl/box_coder_kernel.cpp index 582f6131bf60f02fb67f0479622123a28e89f713..b98435f9b09864d8ced90955c6fd3167c770bf31 100644 --- a/src/operators/kernel/cl/box_coder_kernel.cpp +++ b/src/operators/kernel/cl/box_coder_kernel.cpp @@ -20,13 +20,57 @@ namespace paddle_mobile { namespace operators { template <> -bool BoxCoderKernel::Init(BoxCoderParam *param) { +bool BoxCoderKernel::Init(BoxCoderParam* param) { + if (param->CodeType() == "decode_center_size") { + this->cl_helper_.AddKernel("box_decoder", "box_coder_kernel.cl"); + } return true; } template <> void BoxCoderKernel::Compute( - const BoxCoderParam ¶m) {} + const BoxCoderParam& param) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputBox()); + const auto* input_priorbox = param.InputPriorBox(); + const auto* 
input_priorboxvar = param.InputPriorBoxVar(); + const auto* input_targetbox = param.InputTargetBox(); + const auto& code_type = param.CodeType(); + if (code_type == "decode_center_size") { + auto prior_box_image = input_priorbox->GetCLImage(); + auto prior_box_var_image = input_priorboxvar->GetCLImage(); + auto target_box_image = input_targetbox->GetCLImage(); + auto output_image = param.OutputBox()->GetCLImage(); + auto& outputDim = param.OutputBox()->dims(); + int new_dims[4] = {1, 1, 1, 1}; + for (int i = 0; i < outputDim.size(); i++) { + new_dims[4 - outputDim.size() + i] = outputDim[i]; + } + int out_C = new_dims[1]; + int out_H = new_dims[2]; + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &prior_box_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &prior_box_var_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &target_box_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + size_t global_work_size[2] = {default_work_size[0], default_work_size[2]}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl b/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..60000c994ecbe421d1c951f14077d764d3665d4d --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl @@ -0,0 +1,147 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
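Several of the GPU_CL kernels in this patch (CLImageToTensor, the box decoder above, fetch) share the same rank-padding idiom: a 1-D to 4-D shape is right-aligned into {N, C, H, W} with leading ones before out_C, out_H and out_W are read off. A self-contained sketch of that idiom, with a hypothetical helper name chosen for illustration:

```
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Right-align the shape into {N, C, H, W}, filling leading positions with 1.
// Assumes dims.size() <= 4, like the kernels in this patch.
std::array<int64_t, 4> PadToNCHW(const std::vector<int64_t> &dims) {
  std::array<int64_t, 4> nchw = {1, 1, 1, 1};
  for (size_t j = 0; j < dims.size(); ++j) {
    nchw[4 - dims.size() + j] = dims[j];
  }
  return nchw;
}

int main() {
  auto d = PadToNCHW({10, 5, 5});  // a C x H x W tensor
  std::cout << d[0] << ' ' << d[1] << ' ' << d[2] << ' ' << d[3] << '\n';
  // prints "1 10 5 5", i.e. out_C = 10, out_H = 5, out_W = 5
}
```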
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void box_decoder(__read_only image2d_t prior_box_image, + __read_only image2d_t prior_box_var_image, + __read_only image2d_t target_box_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H + ){ + const int out_c = get_global_id(0); + const int out_nh = get_global_id(1); + const int out_h = out_nh%out_H; + const int out_n = 1; + + const int prior_box_n = 1; + const int prior_box_c = 0; + const int prior_box_h = out_h; + + + const int prior_box_var_n = 1; + const int prior_box_var_c = 0; + const int prior_box_var_h = out_h; + + const int target_box_n = 1; + const int target_box_c = out_c; + const int target_box_h = out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + int2 prior_box_pos; + int2 prior_box_var_pos; + int2 target_box_pos; + int2 output_pos; + + prior_box_pos.x = prior_box_c * 4; + prior_box_pos.y = prior_box_n * prior_box_h; + + prior_box_var_pos.x = prior_box_var_c * 4; + prior_box_var_pos.y = prior_box_var_n * prior_box_var_h; + + target_box_pos.x = target_box_c * 4; + target_box_pos.y = target_box_n * target_box_h; + + output_pos.x = out_c * 4; + output_pos.y = out_n * out_h; + + half4 prior_box_input[4]; + half4 prior_box_var_input[4]; + half4 target_box_input[4]; + + prior_box_input[0] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 0,prior_box_pos.y)); + prior_box_input[1] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 1,prior_box_pos.y)); + prior_box_input[2] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 2,prior_box_pos.y)); + prior_box_input[3] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 3,prior_box_pos.y)); + + prior_box_var_input[0] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 0,prior_box_var_pos.y)); + prior_box_var_input[1] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 1,prior_box_var_pos.y)); + prior_box_var_input[2] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 2,prior_box_var_pos.y)); + prior_box_var_input[3] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 3,prior_box_var_pos.y)); + + + + target_box_input[0] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 0,target_box_pos.y)); + target_box_input[1] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 1,target_box_pos.y)); + target_box_input[2] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 2,target_box_pos.y)); + target_box_input[3] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 3,target_box_pos.y)); + + half prior_box_width = prior_box_input[2].x - prior_box_input[0].x; + half prior_box_height = prior_box_input[3].x - prior_box_input[1].x; + half prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(half)2; + half prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(half)2; + + half4 target_box_center_x; + half4 target_box_center_y; + half4 target_box_width; + half4 target_box_height; + half4 output[4]; + + output[0] = 0.0f; + output[1] = 0.0f; + output[2] = 0.0f; + output[3] = 0.0f; + + target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x; + target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y; + target_box_width.x = exp(prior_box_var_input[2].x * 
target_box_input[2].x) * prior_box_width; + target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height; + + output[0].x = target_box_center_x.x - target_box_width.x/(half)2; + output[1].x = target_box_center_y.x - target_box_height.x/(half)2; + output[2].x = target_box_center_x.x + target_box_width.x/(half)2; + output[3].x = target_box_center_y.x + target_box_height.x/(half)2; + + if(out_C - out_c * 4 >= 2){ + target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x; + target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y; + target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width; + target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height; + output[0].y = target_box_center_x.y - target_box_width.y/(half)2; + output[1].y = target_box_center_y.y - target_box_height.y/(half)2; + output[2].y = target_box_center_x.y + target_box_width.y/(half)2; + output[3].y = target_box_center_y.y + target_box_height.y/(half)2; + + } + if(out_C - out_c * 4 >= 3){ + target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x; + target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y; + target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width; + target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height; + output[0].z = target_box_center_x.z - target_box_width.z/(half)2; + output[1].z = target_box_center_y.z - target_box_height.z/(half)2; + output[2].z = target_box_center_x.z + target_box_width.z/(half)2; + output[3].z = target_box_center_y.z + target_box_height.z/(half)2; + } + if(out_C - out_c * 4 >= 4){ + target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x; + target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y; + target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width; + target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height; + output[0].w = target_box_center_x.w - target_box_width.w/(half)2; + output[1].w = target_box_center_y.w - target_box_height.w/(half)2; + output[2].w = target_box_center_x.w + target_box_width.w/(half)2; + output[3].w = target_box_center_y.w + target_box_height.w/(half)2; + } + + + write_imageh(output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); + write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); + write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); + write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); + +} \ No newline at end of file diff --git a/src/operators/kernel/cl/cl_kernel/concat_kernel.cl b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b07ee4d819b25ef77729ed868c54b19a3d8699ae --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl @@ -0,0 +1,154 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
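The box_decoder kernel above applies the decode_center_size transform to each of the four channel lanes independently. A scalar C++ reference of the same math for a single box may be easier to follow; this is an illustrative sketch (box layout [xmin, ymin, xmax, ymax]), not code from the patch:

```
#include <array>
#include <cmath>
#include <iostream>

std::array<float, 4> DecodeCenterSize(const std::array<float, 4> &prior,
                                      const std::array<float, 4> &prior_var,
                                      const std::array<float, 4> &target) {
  // Prior box extent and center.
  float pw = prior[2] - prior[0];
  float ph = prior[3] - prior[1];
  float pcx = (prior[2] + prior[0]) / 2.f;
  float pcy = (prior[3] + prior[1]) / 2.f;

  // Decoded center and size: offsets are scaled by the prior variances.
  float cx = prior_var[0] * target[0] * pw + pcx;
  float cy = prior_var[1] * target[1] * ph + pcy;
  float w = std::exp(prior_var[2] * target[2]) * pw;
  float h = std::exp(prior_var[3] * target[3]) * ph;

  return {cx - w / 2.f, cy - h / 2.f, cx + w / 2.f, cy + h / 2.f};
}

int main() {
  auto box = DecodeCenterSize({0.1f, 0.1f, 0.3f, 0.3f},
                              {0.1f, 0.1f, 0.2f, 0.2f},
                              {0.5f, 0.5f, 0.0f, 0.0f});
  std::cout << box[0] << ' ' << box[1] << ' ' << box[2] << ' ' << box[3]
            << '\n';  // expected: 0.11 0.11 0.31 0.31
}
```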
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +/* + +__kernel void concatByC(__read_only image2d_t input_image1, + __read_only image2d_t input_image2, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int out_C_Start, + __private const int in_W, + __private const int in_H, + __private const int int_C1, + __private const int int_C2) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + int out_c1 = (out_C_Start)/4 + in_c; + + int out_c2 = out_c1 + 1; + + int2 output_pos1; + int2 output_pos2; + + output_pos1.x = out_c1 * out_W + in_w; + output_pos1.y = in_nh; + + output_pos2.x = out_c2 * out_W + in_w; + output_pos2.y = in_nh; + + int2 input_pos1; + if(in_c==0){ + input_pos1.x = ((in_C1-1)/4) * in_W + in_w; + }else{ + input_pos1.x = (in_c - 1) * in_W + in_w; + } + + input_pos1.y = in_nh; + + int2 input_pos2; + input_pos2.x = in_c * in_W + in_w; + input_pos2.y = in_nh; + + half4 output1; + half4 output2; + half4 input1; + half4 input2; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + if(in_c==0){ + input1 = read_imageh(input_image1, sampler,input_pos1); + + }else { + input1 = read_imageh(input_image2, sampler,input_pos1); + } + input2 = read_imageh(input_image2, sampler,input_pos2); + output1 = input1; + + if(out_C_Start%4==0){ + output2 = input2; + + }else if(out_C_Start%4==1){ + output1.y = input2.x; + output1.z = input2.y; + output1.w = input2.z; + output2.x = input2.w; + output2.y = 0.0f; + output2.z = 0.0f; + output2.w = 0.0f; + + }else if(out_C_Start%4==2){ + output1.z = input2.x; + output1.w = input2.y; + output2.x = input2.z; + output2.y = input2.w; + output2.z = 0.0f; + output2.w = 0.0f; + + }else if(out_C_Start%4==3){ + output1.w = input2.x; + output2.x = input2.y; + output2.y = input2.z; + output2.z = input2.w; + output2.w = 0.0f; + } + write_imageh(output_image, output_pos1, output1); + write_imageh(output_image, output_pos2, output2); +} + +__kernel void concatByW0(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos = in_c * out_W + in_w; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + input = read_imageh(input_image, sampler,input_pos); + + write_imageh(output_image, input_pos, input); + +} +*/ + +__kernel void concatByH(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W, + __private const int out_H_Start) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos; + input_pos.x = in_c * out_W + in_w; + input_pos.y = in_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + input = read_imageh(input_image, sampler,input_pos); + 
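+    // concatByH stacks inputs along the image height axis: each input texel
+    // keeps its x coordinate (c_block * out_W + w) and is shifted down by
+    // out_H_Start rows, which the host advances by each input's height.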
+ int2 output_pos; + output_pos.x = input_pos.x; + output_pos.y = out_H_Start + input_pos.y; + + write_imageh(output_image, output_pos, input); + +} + + diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 63e6e62345c8034ef914b4c385e6fd976b267c4c..2247df59fb77a67a87a00bd26de014f94e86a378 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -138,19 +138,19 @@ __kernel void conv_3x3(__private const int global_size_dim0, int2 pos_of_weight; pos_of_weight.x = i * 3 + j % 3; pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - float4 weight_x = read_imagef(filter, sampler, pos_of_weight); + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); output.x += dot(input[j], weight_x); pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - float4 weight_y = read_imagef(filter, sampler, pos_of_weight); + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); output.y += dot(input[j], weight_y); pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - float4 weight_z = read_imagef(filter, sampler, pos_of_weight); + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); output.z += dot(input[j], weight_z); pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - float4 weight_w = read_imagef(filter, sampler, pos_of_weight); + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); output.w += dot(input[j], weight_w); } */ diff --git a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl index 200a221c9bda49c42f2caff374fc24d6e4df27e5..bb661f3cf7102d5ef35b57f2167face0957129bc 100644 --- a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl @@ -13,26 +13,50 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w,int c) - { - int i = get_global_id(0); - int j = get_global_id(1); - half4 pixel; - pixel.x = convert_half(in[(i * w + j)]); - if(c>=2){ - pixel.y = convert_half(in[h * w + (i * w + j)]); - }else{ - pixel.y = 0.0; - } - if(c>=3){ - pixel.z = convert_half(in[2 * h * w + (i * w + j)]); - }else{ - pixel.z = 0.0; - } - pixel.w = 0.0; - int2 coords; - coords.x = j; - coords.y = i; - - write_imageh(outputImage,coords,pixel); +__kernel void feed(__global float *in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2){ + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh/out_H; + const int out_h = out_nh%out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + half4 output = (half4)0.0f; + output.x = convert_half(in[input_pos0]); + if(out_C - 4 * out_c>=2){ + output.y = convert_half(in[input_pos1]); + } + if(out_C - 4 * out_c>=3){ + output.z = convert_half(in[input_pos2]); + } + if(out_C - 4 * out_c>=4){ + output.w = convert_half(in[input_pos3]); + } + write_imageh(output_image, output_pos, output); + } diff --git a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl index 053c7b3f06249f426c4a2203ba9c89362ded6a08..886f62df687361fa40f9987659d0fe31e575de6a 100644 --- a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl @@ -19,47 +19,52 @@ __kernel void prior_box(__private const int global_size_dim0, __private const int global_size_dim2, __global float *box_width, __global float *box_height, - __write_only image2d_t output_image, + __global float *variances_Buffer, + __write_only image2d_t output_boxes, + __write_only image2d_t output_variances, __private const float step_width, __private const float step_height, __private const float offset, __private const int img_width, __private const int img_height, __private const int num_priors, - __private const int C){ - + __private const int C, + __private const int clip){ const int out_c = get_global_id(0); const int out_nh = get_global_id(1); const int out_n = out_nh/num_priors; const int out_h = out_nh%num_priors; - if (out_c >= global_size_dim0 ||out_nh >= global_size_dim2) { - return; - } - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; int2 output_pos; output_pos.x = out_c * 4; output_pos.y = out_nh; - float center_x0 = (offset + out_c * 4) * step_width; - float center_x1 = (offset + out_c * 4 + 1) * step_width; - float center_x2 = (offset + out_c * 4 + 2) * step_width; - float center_x3 = (offset + out_c * 4 + 3) * step_width; - float 
center_y = (out_n + offset) * step_height; + float center_x0 = (offset + (float)(out_c * 4)) * step_width; + float center_x1 = (offset + (float)(out_c * 4 + 1)) * step_width; + float center_x2 = (offset + (float)(out_c * 4 + 2)) * step_width; + float center_x3 = (offset + (float)(out_c * 4 + 3)) * step_width; + float center_y = ((float)out_n + offset) * step_height; half4 output[4]; - output[0].x = convert_half((center_x0 - box_width[out_h]) / img_width); - output[1].x = convert_half((center_y - box_height[out_h]) / img_height); - output[2].x = convert_half((center_x0 + box_width[out_h]) / img_width); - output[3].x = convert_half((center_y + box_height[out_h]) / img_height); + half4 variances[4]; + output[0].x = convert_half((center_x0 - box_width[out_h]) / (float)img_width); + output[1].x = convert_half((center_y - box_height[out_h]) / (float)img_height); + output[2].x = convert_half((center_x0 + box_width[out_h]) / (float)img_width); + output[3].x = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].x = convert_half(variances_Buffer[0]); + variances[1].x = convert_half(variances_Buffer[1]); + variances[2].x = convert_half(variances_Buffer[2]); + variances[3].x = convert_half(variances_Buffer[3]); if(C - 4 * out_c>=2){ - output[0].y = convert_half((center_x1 - box_width[out_h]) / img_width); - output[1].y = convert_half((center_y - box_height[out_h]) / img_height); - output[2].y = convert_half((center_x1 + box_width[out_h]) / img_width); - output[3].y = convert_half((center_y + box_height[out_h]) / img_height); + output[0].y = convert_half((center_x1 - box_width[out_h]) / (float)img_width); + output[1].y = convert_half((center_y - box_height[out_h]) / (float)img_height); + output[2].y = convert_half((center_x1 + box_width[out_h]) / (float)img_width); + output[3].y = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].y = convert_half(variances_Buffer[0]); + variances[1].y = convert_half(variances_Buffer[1]); + variances[2].y = convert_half(variances_Buffer[2]); + variances[3].y = convert_half(variances_Buffer[3]); }else{ output[0].y = 0.0f; output[1].y = 0.0f; @@ -67,10 +72,14 @@ __kernel void prior_box(__private const int global_size_dim0, output[3].y = 0.0f; } if(C - 4 * out_c>=3){ - output[0].z = convert_half((center_x2 - box_width[out_h]) / img_width); - output[1].z = convert_half((center_y - box_height[out_h]) / img_height); - output[2].z = convert_half((center_x2 + box_width[out_h]) / img_width); - output[3].z = convert_half((center_y + box_height[out_h]) / img_height); + output[0].z = convert_half((center_x2 - box_width[out_h]) / (float)img_width); + output[1].z = convert_half((center_y - box_height[out_h]) / (float)img_height); + output[2].z = convert_half((center_x2 + box_width[out_h]) / (float)img_width); + output[3].z = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].z = convert_half(variances_Buffer[0]); + variances[1].z = convert_half(variances_Buffer[1]); + variances[2].z = convert_half(variances_Buffer[2]); + variances[3].z = convert_half(variances_Buffer[3]); }else{ output[0].z = 0.0f; output[1].z = 0.0f; @@ -78,23 +87,43 @@ __kernel void prior_box(__private const int global_size_dim0, output[3].z = 0.0f; } if(C - 4 * out_c>=4){ - output[0].w = convert_half((center_x3 - box_width[out_h]) / img_width); - output[1].w = convert_half((center_y - box_height[out_h]) / img_height); - output[2].w = convert_half((center_x3 + box_width[out_h]) / img_width); - output[3].w = 
convert_half((center_y + box_height[out_h]) / img_height); + output[0].w = convert_half((center_x3 - box_width[out_h]) / (float)img_width); + output[1].w = convert_half((center_y - box_height[out_h]) / (float)img_height); + output[2].w = convert_half((center_x3 + box_width[out_h]) / (float)img_width); + output[3].w = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].w = convert_half(variances_Buffer[0]); + variances[1].w = convert_half(variances_Buffer[1]); + variances[2].w = convert_half(variances_Buffer[2]); + variances[3].w = convert_half(variances_Buffer[3]); }else{ - output[0].z = 0.0f; - output[1].z = 0.0f; - output[2].z = 0.0f; - output[3].z = 0.0f; + output[0].w = 0.0f; + output[1].w = 0.0f; + output[2].w = 0.0f; + output[3].w = 0.0f; + } + if(clip==1){ + output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); + output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); + output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); + output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); } - output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[0]); - write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[1]); - write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[2]); - write_imageh(output_image, (int2)(output_pos.x + 4, output_pos.y), output[3]); + /* + if(output_pos.x == 0 && output_pos.y == 1){ + float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x); + printf("output = %v4hlf \n", out); + + } + */ + + write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]); + write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]); + write_imageh(output_boxes, (int2)(output_pos.x + 2, output_pos.y), output[2]); + write_imageh(output_boxes, (int2)(output_pos.x + 3, output_pos.y), output[3]); + + write_imageh(output_variances, (int2)(output_pos.x + 0, output_pos.y), variances[0]); + write_imageh(output_variances, (int2)(output_pos.x + 1, output_pos.y), variances[1]); + write_imageh(output_variances, (int2)(output_pos.x + 2, output_pos.y), variances[2]); + write_imageh(output_variances, (int2)(output_pos.x + 3, output_pos.y), variances[3]); + } \ No newline at end of file diff --git a/src/operators/kernel/cl/cl_kernel/reshape.cl b/src/operators/kernel/cl/cl_kernel/reshape.cl index 0ffc64f15cd531879de4852f976769790b6bafe4..7957001c9659e94e706888b658ebf39640f61a0a 100644 --- a/src/operators/kernel/cl/cl_kernel/reshape.cl +++ b/src/operators/kernel/cl/cl_kernel/reshape.cl @@ -14,26 +14,150 @@ limitations under the License. 
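The reworked prior_box kernel above now writes two images (boxes and per-box variances) and only clamps to [0, 1] when clip is set. For one output lane the box math reduces to the scalar sketch below; the helper name is invented for illustration, and box_w/box_h follow the same convention as the box_width/box_height buffers passed to the kernel:

```
#include <algorithm>
#include <array>
#include <iostream>

// One prior-box lane: coordinates are normalized by the input image size
// and optionally clipped to [0, 1].
std::array<float, 4> PriorBoxAt(int col, int row, float box_w, float box_h,
                                float step_w, float step_h, float offset,
                                int img_w, int img_h, bool clip) {
  const float cx = (offset + static_cast<float>(col)) * step_w;
  const float cy = (offset + static_cast<float>(row)) * step_h;
  std::array<float, 4> box = {(cx - box_w) / img_w, (cy - box_h) / img_h,
                              (cx + box_w) / img_w, (cy + box_h) / img_h};
  if (clip) {
    for (float &v : box) v = std::min(std::max(v, 0.f), 1.f);
  }
  return box;
}

int main() {
  auto b = PriorBoxAt(0, 0, 30.f, 30.f, 16.f, 16.f, 0.5f, 300, 300, true);
  std::cout << b[0] << ' ' << b[1] << ' ' << b[2] << ' ' << b[3] << '\n';
}
```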
*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void reshape(__read_only image2d_t input, - __write_only image2d_t output, - __private const int d0, - __private const int d1, - __private const int d2, - __private const int d3, - __private const int x0, - __private const int x1, - __private const int x2, - __private const int x3) { - const int x = get_global_id(0); - const int y = get_global_id(1); +__kernel void reshape(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H, + __private const int in_Stride0, + __private const int in_Stride1, + __private const int in_Stride2, + __private const int out_Stride0, + __private const int out_Stride1, + __private const int out_Stride2) { - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh/out_H; + const int out_h = out_nh%out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4+ 2; + const int out_c3 = out_c * 4+ 3; + + int count0 = out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w; + int count1 = out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w; + int count2 = out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w; + int count3 = out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w; + + int in_n0 = count0/in_Stride2; + int in_n1 = count1/in_Stride2; + int in_n2 = count1/in_Stride2; + int in_n3 = count2/in_Stride2; + + count0 = count0%in_Stride2; + count1 = count1%in_Stride2; + count2 = count2%in_Stride2; + count3 = count3%in_Stride2; + + int in_c0 = count0/in_Stride1; + int in_c1 = count1/in_Stride1; + int in_c2 = count2/in_Stride1; + int in_c3 = count3/in_Stride1; + + int in_h0 = (count0%in_Stride1)/in_Stride0; + int in_h1 = (count1%in_Stride1)/in_Stride0; + int in_h2 = (count2%in_Stride1)/in_Stride0; + int in_h3 = (count3%in_Stride1)/in_Stride0; + + int in_w0 = (count0%in_Stride1)%in_Stride0; + int in_w1 = (count1%in_Stride1)%in_Stride0; + int in_w2 = (count2%in_Stride1)%in_Stride0; + int in_w3 = (count3%in_Stride1)%in_Stride0; + + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = (in_c0/4) * in_W + in_w0; + input_pos0.y = in_n0 * in_H + in_h0; + + input_pos1.x = (in_c1/4) * in_W + in_w1; + input_pos1.y = in_n1 * in_H + in_h1; + + input_pos2.x = (in_c2/4) * in_W + in_w2; + input_pos2.y = in_n2 * in_H + in_h2; + + input_pos3.x = (in_c3/4) * in_W + in_w3; + input_pos3.y = in_n3 * in_H + in_h3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 input0; + half4 input1; + half4 input2; + half4 input3; + half4 output; + + input0 = read_imageh(input_image, sampler,input_pos0); + if(in_c0%4==0){ + output.x = input0.x; + }else if(in_c0%4==1){ + output.x = input0.y; + }else if(in_c0%4==2){ + output.x = input0.z; + }else{ + output.x = input0.w; + } + if(out_C - out_c * 4>=2){ + input1 = read_imageh(input_image, sampler,input_pos1); + if(in_c1%4==0){ + output.y = input1.x; + }else if(in_c1%4==1){ + output.y = input1.y; + }else if(in_c1%4==2){ + output.y = input1.z; + }else{ + 
output.y = input1.w; + } + + }else{ + output.y = 0.0f; + } + + if(out_C - out_c * 4>=3){ + input2 = read_imageh(input_image, sampler,input_pos2); + + if(in_c2%4==0){ + output.z = input2.x; + }else if(in_c2%4==1){ + output.z = input1.y; + }else if(in_c2%4==2){ + output.z = input2.z; + }else{ + output.z = input2.w; + } + }else{ + output.z = 0.0f; + } - half4 in = read_imageh(input, sampler, (int2)(x, y)); + if(out_C - out_c * 4>=4){ + input3 = read_imageh(input_image, sampler,input_pos3); + if(in_c3%4==0){ + output.w = input3.x; + }else if(in_c3%4==1){ + output.w = input3.y; + }else if(in_c3%4==2){ + output.w = input3.z; + }else{ + output.w = input3.w; + } + }else{ + output.w = 0.0f; + } - write_imageh(output, (int2)(x, y), in); + write_imageh(output_image, output_pos, output); } diff --git a/src/operators/kernel/cl/cl_kernel/softmax.cl b/src/operators/kernel/cl/cl_kernel/softmax.cl index 215ec69fc283dcb2b538300cb5591b2b9e4b6a13..a1fa014e00d021f6ad39ac49a841b54476b63639 100644 --- a/src/operators/kernel/cl/cl_kernel/softmax.cl +++ b/src/operators/kernel/cl/cl_kernel/softmax.cl @@ -16,35 +16,46 @@ limitations under the License. */ __kernel void softmax(__read_only image2d_t input_image, __write_only image2d_t output_image, - __private const int group + __private const int out_W ) { const int out_c = get_global_id(0); // block index const int out_w = get_global_id(1); // index in one block const int out_nh = get_global_id(2); + const int in_c = out_c; + const int in_w = out_w; + const int in_nh = out_nh; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + int2 input_pos; + int2 output_pos; - half maxv = 0.0f; - for (int i = 0; i < group; ++i) { - half4 temp = read_imageh(input_image, sampler, (int2)(i, 0)); - maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w)))); - } + input_pos.x = in_c * out_W + in_w; + input_pos.y = in_nh; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; - half4 rsum = (half4)(0.0f); - for (int i = 0; i < group; ++i) { - half4 r = read_imageh(input_image, sampler, (int2)(i, 0)); - rsum += convert_half4(exp(convert_float4(r - maxv))); - } + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 input_max = 0.0f; + half4 input_tmp; + for(int i=0;i=2){ + input1 = read_imageh(input_image, sampler,input_pos1); + if(out_w%4==0){ + output.y = input1.x; + }else if(out_w%4==1){ + output.y = input1.y; + }else if(out_w%4==2){ + output.y = input1.z; + }else{ + output.y = input1.w; + } + + }else{ + output.y = 0.0f; + } + + if(out_C - out_c * 4>=3){ + input2 = read_imageh(input_image, sampler,input_pos2); + + if(out_w%4==0){ + output.z = input2.x; + }else if(out_w%4==1){ + output.z = input2.y; + }else if(out_w%4==2){ + output.z = input2.z; + }else{ + output.z = input2.w; + } + }else{ + output.z = 0.0f; + } + + if(out_C - out_c * 4>=4){ + input3 = read_imageh(input_image, sampler,input_pos3); + if(out_w%4==0){ + output.w = input3.x; + }else if(out_w%4==1){ + output.w = input3.y; + }else if(out_w%4==2){ + output.w = input3.z; + }else{ + output.w = input3.w; + } + }else{ + output.w = 0.0f; + } + write_imageh(output_image, output_pos, output); +} + +__kernel void transpose( __read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W + ){ + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = 
get_global_id(2); + const int out_n = 1; + const int out_h = out_nh%out_H; + + const int in_n = 1; + const int in_c = out_c; + const int in_w = out_h; + const int in_h = out_w; + + int2 input_pos; + int2 output_pos; + + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_n * in_h; + + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_n * out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 input; + half4 output; + input = read_imageh(input_image, sampler,input_pos); + + output = input; + write_imageh(output_image, output_pos, output); + +} \ No newline at end of file diff --git a/src/operators/kernel/cl/concat_kernel.cpp b/src/operators/kernel/cl/concat_kernel.cpp index 48fbd03315957f54873e5ee18bc95896ca306554..3deb31e7aa0c408cc2b87c523d324001f75ade88 100644 --- a/src/operators/kernel/cl/concat_kernel.cpp +++ b/src/operators/kernel/cl/concat_kernel.cpp @@ -21,11 +21,49 @@ namespace operators { template <> bool ConcatKernel::Init(ConcatParam *param) { + if (param->Out()->dims().size() < 4) { + this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl"); + } return true; } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) {} +void ConcatKernel::Compute(const ConcatParam ¶m) { + if (param.Out()->dims().size() < 4) { + auto kernel = this->cl_helper_.KernelAt(0); + auto inputs = param.Inputs(); + auto *output_image = param.Out()->GetCLImage(); + int out_W = 0; + if (param.Out()->dims().size() == 3) { + out_W = param.Out()->dims()[2]; + } else if (param.Out()->dims().size() == 2) { + out_W = param.Out()->dims()[1]; + } + int out_H_Start = 0; + for (int i = 0; i < inputs.size(); i++) { + auto input_image = inputs[i]->GetCLImage(); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]); + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H_Start); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + if (param.Out()->dims().size() == 3) { + out_H_Start += inputs[i]->dims()[1]; + } else if (param.Out()->dims().size() == 2) { + out_H_Start += inputs[i]->dims()[0]; + } + } + } +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp index 941a6cb815541d1eca30ccc193161838ce28da80..e813d08e6b11fbfe330a39532561f062d175bfdb 100644 --- a/src/operators/kernel/cl/feed_kernel.cpp +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -27,6 +27,7 @@ bool FeedKernel::Init(FeedParam *param) { template <> void FeedKernel::Compute(const FeedParam ¶m) { auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); cl_int status; param.Out()->InitEmptyImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue(), param.Out()->dims()); @@ -35,10 +36,13 @@ void FeedKernel::Compute(const FeedParam ¶m) { // DLOG << *input; const float *input_data = input->data(); int numel = input->numel(); - cl_mem cl_image = output->GetCLImage(); - int c = input->dims()[1]; - int height = output->dims()[2]; - int width = 
output->dims()[3]; + cl_mem output_image = output->GetCLImage(); + const int out_C = output->dims()[1]; + const int out_H = output->dims()[2]; + const int out_W = output->dims()[3]; + const int Stride2 = out_C * out_H * out_W; + const int Stride1 = out_H * out_W; + const int Stride0 = out_W; CLTensor input_cl_tensor(this->cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); input_cl_tensor.Resize(input->dims()); @@ -46,21 +50,25 @@ void FeedKernel::Compute(const FeedParam ¶m) { status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_image); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &width); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); CL_CHECK_ERRORS(status); - size_t global_work_size[2] = {width, height}; - - // cl_event out_event = param.Out()->GetClEvent(); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp index 8ea0b3ad3d33f0352fba7697fc08ad7a2039e6ab..e13fbcaed6cd6caec495cafa31b9147a89d54753 100644 --- a/src/operators/kernel/cl/fetch_kernel.cpp +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ -22,11 +22,11 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { - if (param->InputX()->dims().size() <= 2) { - this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); - } else { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - } + // if (param->InputX()->dims().size() <= 2) { + // this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); + // } else { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + // } return true; } @@ -37,6 +37,7 @@ void FetchKernel::Compute(const FetchParam ¶m) { auto input = param.InputX()->GetCLImage(); auto *out = param.Out(); + out->Resize(param.InputX()->dims()); out->mutable_data(); const auto &dim = param.InputX()->dims(); size_t new_dims[] = {1, 1, 1, 1}; @@ -49,11 +50,11 @@ void FetchKernel::Compute(const FetchParam ¶m) { C = new_dims[1]; in_height = new_dims[2]; - if (dim.size() <= 2) { - in_width = param.InputX()->ImageWidth(); - } else { - in_width = new_dims[3]; - } + // if (dim.size() <= 2) { + // in_width = param.InputX()->ImageWidth(); + // } else { + in_width = new_dims[3]; + // } CLTensor out_cl_tensor(this->cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); @@ -64,16 +65,16 @@ void FetchKernel::Compute(const FetchParam ¶m) { clSetKernelArg(kernel, 1, sizeof(int), &in_width); clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); clSetKernelArg(kernel, 3, 
sizeof(cl_mem), &outBuffer); - if (dim.size() > 2) { - int size_ch = in_height * in_width; - int size_block = size_ch * 4; - int size_batch = size_ch * C; - int out_c = new_dims[1]; - clSetKernelArg(kernel, 4, sizeof(int), &size_ch); - clSetKernelArg(kernel, 5, sizeof(int), &size_block); - clSetKernelArg(kernel, 6, sizeof(int), &size_batch); - clSetKernelArg(kernel, 7, sizeof(int), &out_c); - } + // if (dim.size() > 2) { + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + int out_c = new_dims[1]; + clSetKernelArg(kernel, 4, sizeof(int), &size_ch); + clSetKernelArg(kernel, 5, sizeof(int), &size_block); + clSetKernelArg(kernel, 6, sizeof(int), &size_batch); + clSetKernelArg(kernel, 7, sizeof(int), &out_c); + // } // cl_event wait_event = param.InpdutX()->GetClEvent(); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, @@ -93,8 +94,6 @@ void FetchKernel::Compute(const FetchParam ¶m) { // << "ms" << std::endl; memcpy(out->data(), out_cl_tensor.Data(), out->memory_size()); - DLOG << *param.InputX(); - DLOG << *out; } template class FetchKernel; diff --git a/src/operators/kernel/cl/multiclass_nms_kernel.cpp b/src/operators/kernel/cl/multiclass_nms_kernel.cpp index e7bf02cde4b5562fd17508a29bd2e820f2e668b6..31ccdc0df5ee50f804f471fc19dae49dcecde3e6 100644 --- a/src/operators/kernel/cl/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/cl/multiclass_nms_kernel.cpp @@ -15,19 +15,323 @@ limitations under the License. */ #ifdef MULTICLASSNMS_OP #include "operators/kernel/multiclass_nms_kernel.h" - +#include "operators/math/poly_util.h" namespace paddle_mobile { namespace operators { template <> bool MultiClassNMSKernel::Init( - MultiClassNMSParam *param) { + MultiClassNMSParam* param) { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); return true; } +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
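+      // (boxes here are in integer pixel coordinates with inclusive end
+      // points, hence the +1 on both width and height)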
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline T PolyIoU(const T* box1, const T* box2, const size_t box_size, + const bool normalized) { + T bbox1_area = math::PolyArea(box1, box_size, normalized); + T bbox2_area = math::PolyArea(box2, box_size, normalized); + T inter_area = math::PolyOverlapArea(box1, box2, box_size, normalized); + if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { + // If coordinate values are is invalid + // if area size <= 0, return 0. + return static_cast(0.); + } else { + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline void NMSFast(const framework::Tensor& bbox, + const framework::Tensor& scores, + const T score_threshold, const T nms_threshold, + const T eta, const int64_t top_k, + std::vector* selected_indices) { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + if (box_size == 4) { + overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + } else { + overlap = PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, box_size, true); + } + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void MultiClassNMS(const framework::Tensor& scores, + const framework::Tensor& bboxes, + std::map>* indices, int* num_nmsed_out, + const int& background_label, const int& nms_top_k, + const int& keep_top_k, const T& nms_threshold, + const T& nms_eta, const T& score_threshold) { + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + framework::Tensor score = scores.Slice(c, c + 1); + /// [c] is key + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, + nms_top_k, &((*indices)[c])); + num_det += (*indices)[c].size(); + } + + *num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > 
keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + // PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; + } +} + +template +void MultiClassOutput(const framework::Tensor& scores, + const framework::Tensor& bboxes, + const std::map>& selected_indices, + framework::Tensor* outs) { + int predict_dim = scores.dims()[1]; + int box_size = bboxes.dims()[1]; + int out_dim = bboxes.dims()[1] + 2; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + + int count = 0; + for (const auto& it : selected_indices) { + /// one batch + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * box_size; + odata[count * out_dim] = label; // label + odata[count * out_dim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); + count++; + } + } +} + +template +void MultiClassNMSCompute(const MultiClassNMSParam& param, + cl_context context, cl_command_queue commandQueue, + cl_kernel kernel0, cl_kernel kernel1) { + auto* input_bboxes_image = param.InputBBoxes(); + auto& input_bboxes_dims = input_bboxes_image->dims(); + Tensor* input_bboxes = new Tensor(); + input_bboxes->Resize(input_bboxes_dims); + input_bboxes->mutable_data(); + DLOG << "yangfei20"; + framework::CLImageToTensor(input_bboxes_image, input_bboxes, context, + commandQueue, kernel0); + DLOG << "yangfei20"; + auto* input_scores_image = param.InputScores(); + auto& input_scores_dims = input_scores_image->dims(); + + Tensor* input_scores = new Tensor(); + input_scores->Resize(input_scores_dims); + input_scores->mutable_data(); + framework::CLImageToTensor(input_scores_image, input_scores, context, + commandQueue, kernel0); + DLOG << "yangfei20"; + auto outs_image = param.Out(); + Tensor* outs = new Tensor(); + outs->Resize(outs_image->dims()); + outs->mutable_data(); + DLOG << *input_bboxes; + DLOG << *input_scores; + DLOG << *outs; + auto background_label = param.BackGroundLabel(); + auto nms_top_k = param.NMSTopK(); + auto keep_top_k = param.KeepTopK(); + auto nms_threshold = param.NMSThreshold(); + auto nms_eta = param.NMSEta(); + auto score_threshold = param.ScoreThreshold(); + int64_t batch_size = input_scores_dims[0]; + int64_t class_num = input_scores_dims[1]; + int64_t predict_dim = input_scores_dims[2]; + int64_t box_dim = input_bboxes_dims[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + for (int64_t i = 0; i < batch_size; ++i) { + framework::Tensor ins_score = input_scores->Slice(i, i + 1); + ins_score.Resize({class_num, 
predict_dim}); + + framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); + ins_boxes.Resize({predict_dim, box_dim}); + + std::map> indices; + int num_nmsed_out = 0; + MultiClassNMS(ins_score, ins_boxes, &indices, &num_nmsed_out, + background_label, nms_top_k, keep_top_k, nms_threshold, + nms_eta, score_threshold); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + float* od = outs->mutable_data({1}); + od[0] = -1; + } else { + int64_t out_dim = box_dim + 2; + outs->mutable_data({num_kept, out_dim}); + for (int64_t i = 0; i < batch_size; ++i) { + framework::Tensor ins_score = input_scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + + framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); + ins_boxes.Resize({predict_dim, box_dim}); + + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + framework::Tensor out = outs->Slice(s, e); + MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); + } + } + } + DLOG << "yangfei20"; + outs_image->InitEmptyImage(context, commandQueue, outs->dims()); + framework::TensorToCLImage(outs, outs_image, context, commandQueue, kernel1); + DLOG << *outs; + delete (input_bboxes); + delete (input_scores); + delete (outs); + DLOG << "yangfei20"; +} template <> void MultiClassNMSKernel::Compute( - const MultiClassNMSParam ¶m) {} + const MultiClassNMSParam& param) { + auto kernel0 = this->cl_helper_.KernelAt(0); + auto kernel1 = this->cl_helper_.KernelAt(1); + MultiClassNMSCompute(param, this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue(), kernel0, + kernel1); +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/prior_box_kernel.cpp b/src/operators/kernel/cl/prior_box_kernel.cpp index 1f8843787bc68c8be681e4c2a79053714b76dc4e..92764b379e8dad8070407fcf012b4bad73fd19a1 100644 --- a/src/operators/kernel/cl/prior_box_kernel.cpp +++ b/src/operators/kernel/cl/prior_box_kernel.cpp @@ -39,6 +39,10 @@ void PriorBoxKernel::Compute( const auto &input_aspect_ratio = param.AspectRatios(); const bool &flip = param.Flip(); const bool &clip = param.Clip(); + int isclip = 0; + if (clip) { + isclip = 1; + } const float &step_w = param.StepW(); const float &step_h = param.StepH(); const float &offset = param.Offset(); @@ -75,6 +79,8 @@ void PriorBoxKernel::Compute( paddle_mobile::memory::Alloc(sizeof(float) * num_priors)); float *box_height = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * num_priors)); + float *variancesptr = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * 4)); int idx = 0; for (size_t s = 0; s < min_sizes.size(); ++s) { auto min_size = min_sizes[s]; @@ -108,6 +114,9 @@ void PriorBoxKernel::Compute( } } } + for (int i = 0; i < variances.size(); i++) { + variancesptr[i] = variances[i]; + } cl_int status; auto kernel = this->cl_helper_.KernelAt(0); auto default_work_size = @@ -116,7 +125,7 @@ void PriorBoxKernel::Compute( int w = default_work_size[1]; int nh = default_work_size[2]; - std::vector box_shape({1, 1, 1, num_priors}); + std::vector box_shape({num_priors}); framework::DDim ddim = framework::make_ddim(box_shape); framework::CLTensor box_width_cl_tensor(this->cl_helper_.CLContext(), @@ -131,16 +140,33 @@ void PriorBoxKernel::Compute( cl_mem box_height_Buffer = box_height_cl_tensor.mutable_with_data(box_height); - DLOG << "c_block:" << c_block; - DLOG << "w:" << w; - DLOG << "nh:" << nh; - DLOG << "step_width:" 
<< step_width; - DLOG << "step_height:" << step_height; - DLOG << "offset:" << offset; - DLOG << "img_width:" << img_width; - DLOG << "img_height:" << img_height; - DLOG << "num_priors:" << num_priors; - DLOG << "C:" << C; + framework::CLTensor variances_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + std::vector variances_shape({4}); + framework::DDim vddim = framework::make_ddim(variances_shape); + + variances_cl_tensor.Resize(vddim); + cl_mem variances_Buffer = + variances_cl_tensor.mutable_with_data(variancesptr); + + // DLOG << "c_block:" << c_block; + // DLOG << "w:" << w; + // DLOG << "nh:" << nh; + // DLOG << "step_width:" << step_width; + // DLOG << "step_height:" << step_height; + // DLOG << "offset:" << offset; + // DLOG << "img_width:" << img_width; + // DLOG << "img_height:" << img_height; + // DLOG << "num_priors:" << num_priors; + // DLOG << "C:" << C; + // DLOG << "isclip:" << isclip; + // printf("param.MinMaxAspectRatiosOrder() = + // %d\n",param.MinMaxAspectRatiosOrder()); for (int i = 0; i < + // num_priors; i++) { + // DLOG << box_width[i]; + // DLOG << box_height[i]; + // } status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(int), &w); @@ -151,28 +177,36 @@ void PriorBoxKernel::Compute( CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output_boxes); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &variances_Buffer); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(float), &step_width); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output_boxes); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(float), &step_height); + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_variances); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(float), &offset); + status = clSetKernelArg(kernel, 8, sizeof(float), &step_width); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &img_width); + status = clSetKernelArg(kernel, 9, sizeof(float), &step_height); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &img_height); + status = clSetKernelArg(kernel, 10, sizeof(float), &offset); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &num_priors); + status = clSetKernelArg(kernel, 11, sizeof(int), &img_width); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &C); + status = clSetKernelArg(kernel, 12, sizeof(int), &img_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &num_priors); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 14, sizeof(int), &C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 15, sizeof(int), &isclip); CL_CHECK_ERRORS(status); size_t global_work_size[2] = {c_block, nh}; status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); + paddle_mobile::memory::Free(box_width); paddle_mobile::memory::Free(box_height); + paddle_mobile::memory::Free(variancesptr); } template class PriorBoxKernel; diff --git a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp index fb3aa9b52f722b21cdc30e54eafadf9dffcfef7a..4e8d3e1d60a4f1fc2cc2f7a93feda18ed0ac0a97 100644 --- 
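Because the variances buffer is inserted at argument index 5 and output_variances at index 7, every later clSetKernelArg index in the prior_box host code shifts by two, and isclip is appended at index 15. The resulting layout is written out below as an illustrative enum; the enumerator names are assumptions, only the order and indices come from the calls above.

```
// Illustrative argument map for the updated prior_box kernel.
enum PriorBoxKernelArg {
  kCBlock = 0,
  kW,
  kNH,
  kBoxWidthBuffer,   // 3
  kBoxHeightBuffer,  // 4
  kVariancesBuffer,  // 5: newly inserted input buffer
  kOutputBoxes,      // 6
  kOutputVariances,  // 7: newly written output image
  kStepWidth,        // 8
  kStepHeight,       // 9
  kOffset,           // 10
  kImgWidth,         // 11
  kImgHeight,        // 12
  kNumPriors,        // 13
  kC,                // 14
  kIsClip            // 15: int flag derived from the bool clip
};
static_assert(kIsClip == 15, "prior_box now takes 16 kernel arguments");

int main() { return 0; }
```

If the .cl kernel's parameter list is edited, these indices have to be kept in lock-step with it, which is what the renumbered clSetKernelArg calls above do.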
a/src/operators/kernel/cl/reshape_kernel.cpp +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -26,40 +26,76 @@ bool ReshapeKernel::Init(ReshapeParam *param) { template <> void ReshapeKernel::Compute(const ReshapeParam ¶m) { auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); const auto *input = param.InputX(); auto *output = param.Out(); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + auto input_image = input->GetCLImage(); + auto output_image = output->GetCLImage(); const auto &inputDim = input->dims(); const auto &outputDim = output->dims(); - int dims[4] = {1, 1, 1, 1}; - int odims[4] = {1, 1, 1, 1}; + int input_dims[4] = {1, 1, 1, 1}; + int output_dims[4] = {1, 1, 1, 1}; // 1 1000 1 1 for (int i = 0; i < inputDim.size(); i++) { - dims[4 - inputDim.size() + i] = inputDim[i]; + input_dims[4 - inputDim.size() + i] = inputDim[i]; } // 1 1 1 1000 for (int i = 0; i < outputDim.size(); i++) { - odims[4 - outputDim.size() + i] = outputDim[i]; + output_dims[4 - outputDim.size() + i] = outputDim[i]; } - clSetKernelArg(kernel, 2, sizeof(cl_int), &dims); - clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]); - clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]); - clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]); - clSetKernelArg(kernel, 6, sizeof(cl_int), &odims); - clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]); - clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]); - clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]); - const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; - // cl_event out_event = param.Out()->GetClEvent(); - // cl_event wait_event = param.InputX()->GetClEvent(); - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, - work_size, NULL, 0, NULL, NULL); + int out_C = output_dims[1]; + int out_H = output_dims[2]; + int out_W = output_dims[3]; + int in_W = input_dims[3]; + int in_H = input_dims[2]; + int in_Stride0 = in_W; + int in_Stride1 = input_dims[2] * input_dims[3]; + int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3]; + int out_Stride0 = out_W; + int out_Stride1 = out_H * out_W; + int out_Stride2 = out_C * out_H * out_W; + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << out_W; + DLOG << "in_W=" << in_W; + DLOG << "default_work_size=" << default_work_size; + DLOG << "in_Stride0=" << in_Stride0; + DLOG << "in_Stride1=" << in_Stride1; + DLOG << "out_Stride0=" << out_Stride0; + DLOG << "out_Stride1=" << out_Stride1; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(int), &in_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, 
sizeof(int), &in_Stride2); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); } template class ReshapeKernel; diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp index 22e6672ee462b963476dc72895329a9117fc16a8..6447b68d3376a23e89df6ee635537a8c2ab3bde8 100644 --- a/src/operators/kernel/cl/softmax_kernel.cpp +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -33,31 +33,24 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { auto *output = param.Out(); auto inputImage = input->GetCLImage(); auto outputImage = output->GetCLImage(); + const auto &outputDim = output->dims(); - int group = output->ImageWidth(); + int dims[4] = {1, 1, 1, 1}; + + for (int i = 0; i < outputDim.size(); i++) { + dims[4 - outputDim.size() + i] = outputDim[i]; + } + + const int out_W = dims[3]; cl_int status; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - status = clSetKernelArg(kernel, 2, sizeof(int), &group); - - // const auto &inputDim = input->dims(); - // - // int dims[4] = {1, 1, 1, 1}; - // - // for (int i = 0; i < inputDim.size(); i++) { - // dims[4 - inputDim.size() + i] = inputDim[i]; - // } - // - // clSetKernelArg(kernel, 2, sizeof(int), &dims); - // clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); - // clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); - // clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); - - // cl_event out_event = param.Out()->GetClEvent(); - // cl_event wait_event = param.InputX()->GetClEvent(); - + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); status = clEnqueueNDRangeKernel( this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, default_work_size.data(), NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/transpose_kernel.cpp b/src/operators/kernel/cl/transpose_kernel.cpp index 3b41753f186967b7dd419fed05201989d45252b6..d3133449b916193b9bdb57f5398e7a1082a65749 100644 --- a/src/operators/kernel/cl/transpose_kernel.cpp +++ b/src/operators/kernel/cl/transpose_kernel.cpp @@ -20,12 +20,113 @@ namespace operators { template <> bool TransposeKernel::Init(TransposeParam *param) { + if (param->Out()->dims().size() == 4) { + this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl"); + } else if (param->Out()->dims().size() < 4) { + this->cl_helper_.AddKernel("transpose", "transpose_kernel.cl"); + } return true; } template <> void TransposeKernel::Compute( - const TransposeParam ¶m) {} + const TransposeParam ¶m) { + if (param.Out()->dims().size() == 4) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); + int out_C = param.Out()->dims()[1]; + int out_H = param.Out()->dims()[2]; + int out_W = param.Out()->dims()[3]; + int in_W = param.InputX()->dims()[3]; + auto output_image = param.Out()->GetCLImage(); + auto input_image = param.InputX()->GetCLImage(); + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << 
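The rewritten reshape host code above passes explicit NCHW strides (Stride0 = W, Stride1 = H*W, Stride2 = C*H*W) instead of the old packed dims arrays. The sketch below, with the 1x1000x1x1 -> 1x1x1x1000 shapes taken from the comments in that hunk, shows how those strides linearize an index and why the reshape leaves the flat offset unchanged; it is a standalone illustration, not the kernel's code.

```
#include <cassert>

// NCHW strides as defined in the reshape host code above:
// Stride0 = W, Stride1 = H * W, Stride2 = C * H * W.
struct Dims { int n, c, h, w; };

static int Offset(const Dims& d, int n, int c, int h, int w) {
  const int stride0 = d.w;
  const int stride1 = d.h * d.w;
  const int stride2 = d.c * d.h * d.w;
  return n * stride2 + c * stride1 + h * stride0 + w;
}

int main() {
  // Reshaping 1x1000x1x1 to 1x1x1x1000 keeps every element's flat offset,
  // which is why only the stride integers change, not the data layout.
  const Dims in{1, 1000, 1, 1};
  const Dims out{1, 1, 1, 1000};
  assert(Offset(in, 0, 123, 0, 0) == Offset(out, 0, 0, 0, 123));
  return 0;
}
```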
out_W; + DLOG << "in_C=" << in_W; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (param.Out()->dims().size() == 3) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); + int out_C = param.Out()->dims()[0]; + int out_H = param.Out()->dims()[1]; + int out_W = param.Out()->dims()[2]; + int in_W = param.InputX()->dims()[2]; + auto output_image = param.Out()->GetCLImage(); + auto input_image = param.InputX()->GetCLImage(); + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << out_W; + DLOG << "in_C=" << in_W; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + + } else if (param.Out()->dims().size() == 2) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); + int out_C = 1; + int out_H = param.Out()->dims()[0]; + int out_W = param.Out()->dims()[1]; + int in_W = param.InputX()->dims()[1]; + auto output_image = param.Out()->GetCLImage(); + auto input_image = param.InputX()->GetCLImage(); + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << out_W; + DLOG << "in_C=" << in_W; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } +} } // namespace operators } // namespace paddle_mobile diff --git 
a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp similarity index 58% rename from src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_add_kernel.cpp index 6c99750eb824940b32a857ee2baffc72bce05a7a..5ad4c86441f7870b00e6639e7cda22083d3c10d5 100644 --- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp @@ -12,49 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef FUSION_CONVADDBNRELU_OP +#ifdef FUSION_CONVADD_OP -#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "operators/kernel/conv_add_kernel.h" namespace paddle_mobile { namespace operators { template <> -bool ConvAddBNReluKernel::Init( - FusionConvAddBNReluParam *param) { - bool relu_enabled = true; +bool ConvAddKernel::Init(FusionConvAddParam *param) { + bool relu_enabled = false; auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); auto out = param->Output(); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && - bias->dims()[0] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - const int channel = out->dims()[1]; + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = - bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + 2] = new_scale_ptr[i]; - bs_ptr[i] = new_bias_ptr[i]; + bs_ptr[i + channel] = 1; + bs_ptr[i] = bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); @@ -75,8 +57,8 @@ bool ConvAddBNReluKernel::Init( } template <> -void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) { +void ConvAddKernel::Compute( + const FusionConvAddParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp similarity index 70% rename from src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp rename to src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 3284ddcdece3ab7fcf4fb4458a59d39c452ad1ce..39d7e818976b56eaea8649392784e7b5dc8b7e1f 100644 --- a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -12,21 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef CONV_TRANSPOSE_OP +#ifdef FUSION_DECONVADD_OP -#include "operators/kernel/conv_transpose_kernel.h" +#include "operators/kernel/deconv_add_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { template <> -bool ConvTransposeKernel::Init(ConvTransposeParam *param) { +bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { return true; } template <> -void ConvTransposeKernel::Compute( - const ConvTransposeParam ¶m) {} +void DeconvAddKernel::Compute( + const FusionDeconvAddParam ¶m) {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ef2556208a8650a86522264f40f42cb596ec4190 --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVADDRELU_OP + +#include "operators/kernel/deconv_add_relu_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvAddReluKernel::Init( + FusionDeconvAddReluParam *param) { + return true; +} + +template <> +void DeconvAddReluKernel::Compute( + const FusionDeconvAddReluParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f8eeb53159411276fbab957c676a01cb31b597c8 --- /dev/null +++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef ELEMENTWISEADD_OP + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { + bool relu_enabled = false; + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + fpga::format_fp16_ofm(out); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp b/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp deleted file mode 100644 index 2c6b616689dca14474d1cbdc3769b438de1358e4..0000000000000000000000000000000000000000 --- a/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
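In the EW-add arguments above, const0 and const1 are both set to 0x3c00, which the comments note equals 1: it is the IEEE 754 half-precision encoding of 1.0 (sign 0, exponent 01111 with bias 15, zero mantissa), presumably the per-input coefficients of the element-wise add. A small decoding sketch for normal half values follows; subnormals, infinities and NaNs are skipped for brevity.

```
#include <cstdint>
#include <cstdio>

// Decode a normal IEEE 754 half-precision value: 1 sign bit, 5 exponent bits
// (bias 15), 10 mantissa bits.
static float HalfToFloat(uint16_t h) {
  const int sign = (h >> 15) & 0x1;
  const int exp = (h >> 10) & 0x1f;
  const int frac = h & 0x3ff;
  if (exp == 0 || exp == 0x1f) return 0.0f;  // subnormal/inf/NaN: out of scope here
  float value = 1.0f + static_cast<float>(frac) / 1024.0f;
  for (int e = exp - 15; e > 0; --e) value *= 2.0f;
  for (int e = exp - 15; e < 0; ++e) value *= 0.5f;
  return sign ? -value : value;
}

int main() {
  std::printf("0x3c00 -> %g\n", HalfToFloat(0x3c00));  // prints 1
  return 0;
}
```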
*/ -#ifdef FUSION_FCRELU_OP -#include "operators/kernel/fc_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcReluKernel::Init(FusionFcReluParam *param) { - bool relu_enabled = true; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - auto input_z = param->InputZ(); - auto input_z_ptr = input_z->data(); - auto out = param->Out(); - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - "Image channel should be equal to weight number"); - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = input_z_ptr[i]; - } - - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = (uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - out->Resize(framework::make_ddim({1, channel, 1, 1})); - filter->Resize(framework::make_ddim({num, filter_channel, height, width})); - float max_value = fpga::filter_find_max(filter); - fpga::format_fc_filter(filter, max_value); - - int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_fp16_ofm(out); - - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, - 0, 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} -template <> -void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/src/operators/kernel/fpga/V1/split_kernel.cpp b/src/operators/kernel/fpga/V1/split_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b8c0bb3be64d2393b61b0f82375c695000f52b65 --- /dev/null +++ b/src/operators/kernel/fpga/V1/split_kernel.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SPLIT_OP + +#include "operators/kernel/split_kernel.h" + +namespace paddle_mobile { +namespace operators { +template <> +bool SplitKernel::Init(SplitParam *param) { + auto *in = const_cast(param->InputX()); + auto outs = param->Outs(); + auto sections = param->Sections(); + int axis = param->Axis(); + PADDLE_MOBILE_ENFORCE(axis == 1, "Only support split in channel dimension"); + PADDLE_MOBILE_ENFORCE(outs.size() == sections.size(), + "Output number should be equal to section number"); + auto image_num = (uint32_t)outs.size(); + auto images_out = + reinterpret_cast(fpga::fpga_malloc(image_num * sizeof(void *))); + auto scales_out = reinterpret_cast( + fpga::fpga_malloc(image_num * sizeof(float *))); + auto out_channels = reinterpret_cast( + fpga::fpga_malloc(image_num * sizeof(uint32_t))); + for (int i = 0; i < image_num; i++) { + fpga::format_fp16_ofm(outs[i]); + images_out[i] = outs[i]->mutable_data(); + scales_out[i] = outs[i]->scale; + out_channels[i] = (uint32_t)sections[i]; + } + + fpga::SplitArgs arg = {0}; + arg.image_num = image_num; + arg.image_in = (half *)in->data(); + arg.scale_in = in->scale; + arg.images_out = images_out; + arg.scales_out = scales_out; + arg.out_channel_nums = out_channels; + arg.height = (uint32_t)in->dims()[2]; + arg.width = (uint32_t)in->dims()[3]; + + param->SetFpgaArgs(arg); + return true; +} +template <> +void SplitKernel::Compute(const SplitParam ¶m) { + fpga::ComputeFPGASplit(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V1/tanh_kernel.cpp b/src/operators/kernel/fpga/V1/tanh_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..216cb726e3fe93e9ebfaf328a9ab4ca0725b6bb1 --- /dev/null +++ b/src/operators/kernel/fpga/V1/tanh_kernel.cpp @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
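SplitKernel above only accepts axis == 1 and hands the FPGA one output image per section, with out_channel_nums[i] = sections[i]; for a channel-wise partition the sections therefore have to add up to the input's channel count. Below is a tiny illustrative check with hypothetical numbers (64 input channels split as 16/16/32 is an assumption, not taken from the patch).

```
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical shapes: 64 input channels split along axis 1
  // into sections of 16, 16 and 32 channels.
  const int in_channels = 64;
  const std::vector<int> sections = {16, 16, 32};
  assert(std::accumulate(sections.begin(), sections.end(), 0) == in_channels);

  int start = 0;
  for (int s : sections) {
    // Each output covers input channels [start, start + s), i.e.
    // out_channel_nums[i] = sections[i] in the FPGA SplitArgs.
    start += s;
  }
  assert(start == in_channels);
  return 0;
}
```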
*/ + +#ifdef TANH_OP + +#include "operators/kernel/tanh_kernel.h" +#include +namespace paddle_mobile { +namespace operators { + +template <> +bool TanhKernel::Init(TanhParam *param) { + auto input = const_cast(param->InputX()); + auto input_ptr = input->data(); + auto float_input = new Tensor; + float_input->mutable_data( + {1, input->dims()[1], input->dims()[2], input->dims()[3]}); + fpga::format_fp32_ofm(float_input); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input_ptr; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = float_input->data(); + args.output.scale_address = float_input->scale; + param->SetFloatInput(float_input); + param->SetFpgaArgs(args); + return true; +} + +#define EXP_MAX_INPUT 40.0 +template +T Tanh(const T a) { + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} +template +void tanhFuntor(Tensor *input, Tensor *output) { + auto *input_ptr = input->data(); + auto *output_ptr = output->mutable_data(); + for (int i = 0; i < input->numel(); i++) { + *(output_ptr + i) = Tanh(*(input_ptr + i)); + } +} +template <> +void TanhKernel::Compute(const TanhParam ¶m) { + Tensor *in_x = param.FloatInput(); + Tensor *out = param.Out(); + + fpga::PerformBypass(param.FpgaArgs()); + fpga::fpga_invalidate((void *)in_x->data(), + in_x->numel() * sizeof(float)); + tanhFuntor(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4505401f434c320003e8122a3a0e197441ae8921 --- /dev/null +++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
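TanhKernel runs a bypass to turn the fp16 input into fp32 and then applies the functor above on the CPU, computing tanh(a) as 2/(1+exp(-2a)) - 1 and clamping the exponent argument at EXP_MAX_INPUT (40) to avoid overflow in exp. A quick sketch checking that identity against std::tanh (standalone, illustrative only):

```
#include <cmath>
#include <cstdio>

// Same formula as the Tanh functor above: tanh(a) = 2 / (1 + exp(-2a)) - 1,
// with the exponent argument clamped at 40 to avoid overflow.
static double TanhViaExp(double a) {
  double t = -2.0 * a;
  if (t > 40.0) t = 40.0;
  return 2.0 / (1.0 + std::exp(t)) - 1.0;
}

int main() {
  const double xs[] = {-3.0, -0.5, 0.0, 0.5, 3.0};
  for (double x : xs)
    std::printf("x=%+.1f  formula=%+.6f  std::tanh=%+.6f\n", x, TanhViaExp(x), std::tanh(x));
  return 0;
}
```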
*/ +#ifdef TRANSPOSE2_OP + +#include "operators/kernel/transpose2_kernel.h" +#include "operators/kernel/central-arm-func/transpose2_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Transpose2Kernel::Init(Transpose2Param *param) { + param->Out()->ShareDataWith(*param->InputX()); + return true; +} + +template <> +void Transpose2Kernel::Compute( + const Transpose2Param ¶m) { + // Transpose2Compute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp b/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp deleted file mode 100644 index ba869aaca7f3f5d5c598feb3837a59a3a738493b..0000000000000000000000000000000000000000 --- a/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef FUSION_FCRELU_OP -#include "operators/kernel/fc_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcReluKernel::Init(FusionFcReluParam *param) { - bool relu_enabled = true; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - auto input_z = param->InputZ(); - auto input_z_ptr = input_z->data(); - auto out = param->Out(); - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - "Image channel should be equal to weight number"); - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = input_z_ptr[i]; - } - - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = (uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - out->Resize(framework::make_ddim({1, channel, 1, 1})); - filter->Resize(framework::make_ddim({num, filter_channel, height, width})); - fpga::format_fc_data(filter, out, bs_ptr); - - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, - 0, 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} -template <> -void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 1dc1ec8933fbf9a7402a5093d80d7b0400d73bb6..b6597b55a97afbe6c5e3efe8c72e627d34206f0e 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -863,6 +863,8 @@ class PriorBoxParam : public OpParam { if (HasAttr("min_max_aspect_ratios_order", attrs)) { min_max_aspect_ratios_order_ = GetAttr("min_max_aspect_ratios_order", attrs); + } else { + min_max_aspect_ratios_order_ = false; } flip_ = 
GetAttr("flip", attrs); clip_ = GetAttr("clip", attrs); @@ -1030,9 +1032,9 @@ class MultiClassNMSParam : public OpParam { score_threshold_ = GetAttr("score_threshold", attrs); } - const RType *InputBBoxes() const { return input_bboxes_; } + RType *InputBBoxes() const { return input_bboxes_; } - const RType *InputScores() const { return input_scores_; } + RType *InputScores() const { return input_scores_; } RType *Out() const { return out_; } @@ -1566,6 +1568,20 @@ class TanhParam : public OpParam { private: RType *input_x_; RType *out_; +#ifdef PADDLE_MOBILE_FPGA + + private: + std::shared_ptr float_input_x_; + fpga::BypassArgs fpga_bypass_args; + + public: + RType *FloatInput() const { + return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); + } + void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } + const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } + void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } +#endif }; #endif @@ -2223,7 +2239,6 @@ class DropoutParam : public OpParam { }; #endif -#ifdef CONV_TRANSPOSE_OP template class ConvTransposeParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; @@ -2278,7 +2293,7 @@ class ConvTransposeParam : public OpParam { void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; } #endif }; -#endif + #ifdef FUSION_DECONVADD_OP template class FusionDeconvAddParam : public ConvTransposeParam { @@ -2434,6 +2449,15 @@ class SplitParam : public OpParam { int num; std::vector sections; // std::vector out_ts_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::SplitArgs fpga_split_args; + + public: + const fpga::SplitArgs &FpgaArgs() const { return fpga_split_args; } + void SetFpgaArgs(const fpga::SplitArgs &args) { fpga_split_args = args; } +#endif }; #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3202e9248ba55afb438a4b4fc894095531aa534f..bfd125ce5b75091cfac1a2a4e2f2f025da0178dc 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -67,7 +67,7 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet50 paddle-mobile) - ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-densebox paddle-mobile) set(FOUND_MATCH ON) @@ -81,7 +81,7 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-pe fpga/test_pe.cpp) target_link_libraries(test-pe paddle-mobile) - ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-densebox paddle-mobile) set(FOUND_MATCH ON) @@ -372,5 +372,8 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h) target_link_libraries(test-eng paddle-mobile) + # gen test + ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h) + target_link_libraries(test-super paddle-mobile) #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) endif () diff --git a/test/fpga/test_densebox_combine.cpp b/test/fpga/test_densebox_combine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..056bbe52d89f69a444174846e602b44d9c581d03 --- /dev/null +++ 
b/test/fpga/test_densebox_combine.cpp @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +static const char *g_densebox_combine = "../models/densebox"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + if (paddle_mobile.Load(std::string(g_densebox_combine) + "/model", + std::string(g_densebox_combine) + "/params", true)) { + // std::vector input; + // std::vector dims{1, 3, 512, 1024}; + // GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + // auto vec_result = paddle_mobile.Predict(input, dims); + + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 512, 1024}, static_cast(0), + static_cast(1)); + // readStream(g_image_src_float, + // input_tensor.mutable_data({1, 3, 224, 224})); + paddle_mobile.FeedData(input_tensor); + paddle_mobile.Predict_To(-1); + } + + return 0; +} diff --git a/test/net/test_super.cpp b/test/net/test_super.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ee1fc0f852cd9f30391069032e556796e922fda --- /dev/null +++ b/test/net/test_super.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "../../src/common/types.h" +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + auto time1 = paddle_mobile::time(); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + + auto isok = paddle_mobile.Load(std::string(g_super) + "/model", + std::string(g_super) + "/params", true, false, + 1, true); + + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 1, 300, 300}; + GetInput(g_yolo_img, &input, dims); + + std::vector vec_result; + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + } + + std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " + "是否存在?" + << std::endl; + return 0; +} diff --git a/test/test_helper.h b/test/test_helper.h index 0eb11efd19b7d937f93eec14e163c8c42cb77f12..60e907fa6fb301b2e944a6d938f662225a743d41 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -36,16 +36,19 @@ static const char *g_squeezenet = "../models/squeezenet"; static const char *g_googlenet = "../models/googlenet"; static const char *g_googlenet_quali = "../models/googlenet_combine_quali"; static const char *g_mobilenet = "../models/mobilenet"; -static const char *g_mobilenet_mul = "../models/mobilenet_mul"; +static const char *g_mobilenet_mul = "../models/r"; static const char *g_alexnet = "../models/alexnet"; static const char *g_inceptionv4 = "../models/inceptionv4"; +static const char *g_inceptionv3 = + "../models/InceptionV3_Spatial_Attention_Model"; static const char *g_nlp = "../models/nlp"; +static const char *g_super = "../models/superresoltion"; static const char *g_resnet_50 = "../models/resnet_50"; static const char *g_resnet = "../models/resnet"; static const char *g_googlenet_combine = "../models/googlenet_combine"; static const char *g_yolo = "../models/yolo"; static const char *g_yolo_combined = "../models/yolo_combined"; -static const char *g_yolo_mul = "../models/yolo_mul"; +static const char *g_yolo_mul = "../models/d"; static const char *g_fluid_fssd_new = "../models/fluid_fssd_new"; static const char *g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; diff --git a/tools/op.cmake b/tools/op.cmake index 5901a23a1ff50c357d69cfff63cdfd543dbf8f9d..3a4a0597a44694c4edea8173af47627cb5680df2 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -102,7 +102,6 @@ if (CON GREATER -1) set(MUL_OP ON) set(RESHAPE_OP ON) set(SOFTMAX_OP ON) - set(FOUND_MATCH ON) endif() @@ -110,24 +109,25 @@ list(FIND NET "FPGA_NET_V1" CON) if (CON GREATER -1) message("FPGA_NET_V1 enabled") set(FUSION_CONVADDRELU_OP ON) - set(FUSION_CONVADDBNRELU_OP ON) - set(FUSION_CONVADDBN_OP ON) set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_FC_OP ON) - set(FUSION_FCRELU_OP ON) set(POOL_OP ON) - set(CONCAT_OP ON) set(SOFTMAX_OP ON) set(FUSION_CONVBNRELU_OP ON) 
     set(FUSION_CONVBN_OP ON)
+    set(TANH_OP ON)
+    set(ELEMENTWISEADD_OP ON)
+    set(TRANSPOSE2_OP ON)
     set(FUSION_CONVADD_OP ON)
+    set(SPLIT_OP ON)
+    set(FUSION_DECONVADD_OP ON)
+    set(FUSION_DECONVADDRELU_OP ON)
     set(FOUND_MATCH ON)
 endif()
 list(FIND NET "FPGA_NET_V2" CON)
 if (CON GREATER -1)
     message("FPGA_NET_V2 enabled")
-    set(FEED_OP ON)
     set(FUSION_CONVADDRELU_OP ON)
     set(FUSION_ELEMENTWISEADDRELU_OP ON)
     set(FUSION_FC_OP ON)
@@ -135,9 +135,6 @@ if (CON GREATER -1)
     set(SOFTMAX_OP ON)
     set(FUSION_CONVBNRELU_OP ON)
     set(FUSION_CONVBN_OP ON)
-    set(CONV_TRANSPOSE_OP ON)
-    set(FUSION_DECONVRELU_OP ON)
-    #set(SLICE_OP ON)
     set(TANH_OP ON)
     set(ELEMENTWISEADD_OP ON)
     set(TRANSPOSE2_OP ON)
@@ -145,8 +142,6 @@ if (CON GREATER -1)
     set(SPLIT_OP ON)
     set(FUSION_DECONVADD_OP ON)
     set(FUSION_DECONVADDRELU_OP ON)
-
-    set(FOUND_MATCH ON)
 endif()