diff --git a/README.md b/README.md
index de8fe0bb7f4613bd9d6dfebd82db1d407ee682f4..fb4daf3bde4658223cff6e2ebdad55d78412f339 100644
--- a/README.md
+++ b/README.md
@@ -26,22 +26,6 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
 
 - **ARM CPU**
 
-|mobilenet arm v7|1 thread|2 threads|4 threads|
-|------------|----|-----|-----|
-|Kirin 960 (ms)|110.586|63.285|38.215|
-|||||
-|mobilenetssd arm v7|1 thread|2 threads|4 threads|
-|Kirin 960 (ms)|220.248|128.473|79.334|
-|||||
-|googlenet(v1) arm v7|1 thread|2 threads|4 threads|
-|Kirin 960 (ms)|341.965|228.724|161.531|
-|||||
-|squeezenet arm v7|1 thread|2 threads|4 threads|
-|Kirin 960 (ms)|84.080|55.641|37.182|
-|||||
-|yolo arm v7|1 thread|2 threads|4 threads|
-|Kirin 960 (ms)|129.445|80.627|50.936|
-
 The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning needs a large amount of hand-written CPU assembly, and we are coding at full speed to squeeze every bit of acceleration out of the hardware.
 
 ARM CPU optimization is still in progress; only conventional CPU optimizations are applied so far. On an ARM A73, paddle-mobile arm-v7 currently runs one MobileNet 1.0 inference in 110+ ms on a single core. This is clearly not our final goal: we are rewriting hot paths in assembly, so there is still large headroom for performance. Only armv7 is supported at present; armv8 support will follow.
diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..01c610ce5b445bc603da3c0dc43ad21c35d95ae6
--- /dev/null
+++ b/src/fpga/api.cpp
@@ -0,0 +1,168 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+
+#include "api.h"
+
+#define FPGA_TEST_MODE
+#ifdef FPGA_TEST_MODE
+#include "common/log.h"
+#endif
+
+namespace paddle_mobile {
+namespace fpga {
+
+static int fd = -1;
+static const char *device_path = "/dev/fpgadrv0";
+
+static inline int do_ioctl(int req, const void *arg) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  return ioctl(fd, req, reinterpret_cast<uint64_t>(arg));
+#else
+  return -1;
+#endif
+}
+
+int open_device() {
+  if (fd == -1) {
+    fd = open(device_path, O_RDWR);
+  }
+  return fd;
+}
+
+// memory management;
+void *fpga_malloc(size_t size) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+#else
+  return malloc(size);
+#endif
+}
+
+void fpga_free(void *ptr) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  munmap(ptr, 0);
+#else
+  free(ptr);
+#endif
+}
+
+void fpga_copy(void *dest, const void *src, size_t num) {
+  memcpy(dest, src, num);
+}
+
+int ComputeFpgaConv(const struct ConvArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << " relu_enabled:" << args.relu_enabled
+       << " sb_address:" << args.sb_address
+       << " filter_address:" << args.filter_address
+       << " filter_num:" << args.filter_num
+       << " group_num:" << args.group_num;
+  DLOG << " image_address:" << args.image.address
+       << " image_scale_address:" << args.image.scale_address
+       << " image_channels:" << args.image.channels
+       << " image_height:" << args.image.height
+       << " image_width:" << args.image.width
+       << " pad_height:" << args.image.pad_height
+       << " pad_width:" << args.image.pad_width;
+  DLOG << " kernel_height:" << args.kernel.height
+       << " kernel_width:" << args.kernel.width
+       << " stride_h:" << args.kernel.stride_h
+       << " stride_w:" << args.kernel.stride_w;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_CONV, &args);
+}
+
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << " image_address:" << args.image.address
+       << " image_scale_address:" << args.image.scale_address
+       << " image_channels:" << args.image.channels
+       << " image_height:" << args.image.height
+       << " image_width:" << args.image.width
+       << " pad_height:" << args.image.pad_height
+       << " pad_width:" << args.image.pad_width;
+  DLOG << " kernel_height:" << args.kernel.height
+       << " kernel_width:" << args.kernel.width
+       << " stride_h:" << args.kernel.stride_h
+       << " stride_w:" << args.kernel.stride_w;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
+}
+
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << " relu_enabled:" << args.relu_enabled << " const0:" << args.const0
+       << " const1:" << args.const1;
+  DLOG << " image0_address:" << args.image0.address
+       << " image0_scale_address:" << args.image0.scale_address
+       << " image0_channels:" << args.image0.channels
+       << " image0_height:" << args.image0.height
+       << " image0_width:" << args.image0.width
+       << " pad0_height:" << args.image0.pad_height
+       << " pad0_width:" << args.image0.pad_width;
+  DLOG << " image1_address:" << args.image1.address
+       << " image1_scale_address:" << args.image1.scale_address
+       << " image1_channels:" << args.image1.channels
+       << " image1_height:" << args.image1.height
+       << " image1_width:" << args.image1.width
+       << " pad1_height:" << args.image1.pad_height
+       << " pad1_width:" << args.image1.pad_width;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_EW, &args);
+}
+
+int PerformBypass(const struct BypassArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << " layout_type:" << args.layout_type
+       << " convert_type:" << args.convert_type;
+  DLOG << " image_address:" << args.image.address
+       << " image_scale_address:" << args.image.scale_address
+       << " image_channels:" << args.image.channels
+       << " image_height:" << args.image.height
+       << " image_width:" << args.image.width
+       << " pad_height:" << args.image.pad_height
+       << " pad_width:" << args.image.pad_width;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api.h
similarity index 100%
rename from src/fpga/api/fpga_api.h
rename to src/fpga/api.h
diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp
deleted file mode 100644
index 779c846d1f3c465e5113f805b2b3856a1a7894c5..0000000000000000000000000000000000000000
--- a/src/fpga/api/fpga_api.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fcntl.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <algorithm>
-#include <cstddef>
-#include <cstring>
-#include <iostream>
-
-#include "fpga/api/fpga_api.h"
-
-namespace paddle_mobile {
-namespace fpga {
-
-static int fd = -1;
-static const char *device_path = "/dev/fpgadrv0";
-
-static inline int do_ioctl(int req, const void *arg) {
-  return ioctl(req, (unsigned int64_t)arg);
-}
-
-int open_device() {
-  if (fd == -1) {
-    fd = open(device_path, O_RDWR);
-  }
-  return fd;
-}
-
-// memory management;
-void *fpga_malloc(size_t size) {
-  return reinterpret_cast<void *>(
-      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
-}
-
-void fpga_free(void *ptr) { munmap(ptr, 0); }
-
-void fpga_copy(void *dest, const void *src, size_t num) {
-  memcpy(dest, src, num);
-}
-
-int ComputeFpgaConv(const struct ConvArgs &args) {
-  return do_ioctl(IOCTL_CONFIG_CONV, &args);
-}
-int ComputeFpgaPool(const struct PoolingArgs &args) {
-  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
-}
-int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
-  return do_ioctl(IOCTL_CONFIG_EW, &args);
-}
-int PerformBypass(const struct BypassArgs &args) {
-  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
-}
-
-}  // namespace fpga
-}  // namespace paddle_mobile
diff --git a/src/fpga/fpga_quantilization.cpp b/src/fpga/fpga_quantilization.cpp
deleted file mode 100644
index 8b351f1a81e0a92f0e2f12a3f61dd2a7d3948c85..0000000000000000000000000000000000000000
--- a/src/fpga/fpga_quantilization.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "fpga/fpga_quantilization.h"
-#include <algorithm>
-
-namespace paddle_mobile {
-namespace fpga {
-
-template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width) {
-  int offset_height = 0;
-
-  for (int n = 0; n < num; n++) {
-    int amount_per_row = width * channel;
-    for (int c = 0; c < channel; c++) {
-      for (int h = 0; h < height; h++) {
-        int offset_height = h * amount_per_row;
-        for (int w = 0; w < width; w++) {
-          *(data_out + offset_height + w * channel + c) = *(data_in++);
-        }
-      }
-    }
-    data_out += num;
-  }
-}
-
-template <typename Dtype>
-static Dtype find_max(Dtype* data, int num) {
-  Dtype max = 0;
-  for (int i = 0; i < num; ++i) {
-    max = std::max(max, data[i]);
-  }
-  return max;
-}
-
-// template <typename Dtype>
-void quantify_filter(framework::Tensor* filter) {
-  DLOG << "quantilize_filter........";
-
-  float scale = 0;
-  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
-
-  const int batch_size = filter->dims()[0];
-  const int channel = filter->dims()[1];
-  const int height = filter->dims()[2];
-  const int width = filter->dims()[3];
-
-  int8_t* int_data = nullptr;
-  int8_t* tmp_data = new int8_t[filter->numel()];
-
-  // 32bit filter -> 8bit filter;
-  if (filter->type() == typeid(float)) {
-    float* float_data = filter->data<float>();
-    float max = find_max(float_data, filter->numel());
-
-    scale = (max / fix_range);
-
-    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = (int8_t)float_data[i] * scale;
-    }
-    int_data = filter->mutable_data<int8_t>();
-  } else {
-    int8_t max = find_max(filter->data<int8_t>(), filter->numel());
-    scale = (max / fix_range);
-
-    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = filter->data<int8_t>()[i];
-    }
-    int_data = filter->mutable_data<int8_t>();
-  }
-  // NCHW -> NHWC;
-  chw_to_hwc(tmp_data, int_data, batch_size, channel, height, width);
-  delete tmp_data;
-  *(filter->fpga_args().scale_pointer()) = scale;
-}
-
-}  // namespace fpga
-}  // namespace paddle_mobile
diff --git a/src/fpga/quantization.cpp b/src/fpga/quantization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..44994d4c353490b533110d0965fb63b4fb5c7aa2
--- /dev/null
+++ b/src/fpga/quantization.cpp
@@ -0,0 +1,92 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/quantization.h"
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+namespace paddle_mobile {
+namespace fpga {
+
+template <typename Dtype>
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width) {
+  for (int n = 0; n < num; n++) {
+    int64_t amount_per_row = width * channel;
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int64_t offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_out + offset_height + w * channel + c) = *(data_in++);
+        }
+      }
+    }
+    data_out += num;
+  }
+}
+
+template <typename Dtype>
+static Dtype find_max(Dtype* data, int64_t num) {
+  Dtype max = 0;
+  for (int i = 0; i < num; ++i) {
+    Dtype value = data[i];
+    Dtype abs = value > 0 ? value : -value;
+    max = std::max(max, abs);
+  }
+  return max;
+}
+
+// template <typename Dtype>
+void quantize_filter(framework::Tensor* filter) {
+  DLOG << "quantize_filter........" << filter->dims();
+
+  float scale = 0;
+  auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
+
+  auto* tmp_data = new int8_t[filter->numel()];
+
+  // 32bit filter -> 8bit filter;
+  if (filter->type() == typeid(float)) {
+    auto* float_data = filter->data<float>();
+    auto max = find_max(float_data, filter->numel());
+
+    scale = (fix_range / max);
+    DLOG << "scale:" << scale;
+
+    for (int i = 0; i < filter->numel(); ++i) {
+      tmp_data[i] = (int8_t)(float_data[i] * scale);
+    }
+  } else {
+    auto max = find_max(filter->data<int8_t>(), filter->numel());
+    scale = (fix_range / max);
+    std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
+  }
+
+  if (filter->dims().size() == 4) {
+    const auto batch_size = filter->dims()[0];
+    const auto channel = filter->dims()[1];
+    const auto height = filter->dims()[2];
+    const auto width = filter->dims()[3];
+    chw_to_hwc(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+               channel, height, width);
+  } else if (filter->dims().size() == 2) {
+    std::memcpy(filter->mutable_data<int8_t>(), tmp_data,
+                (size_t)filter->numel());
+  }
+
+  delete[] tmp_data;
+  filter->SetFpgaScale(scale);
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/fpga_quantilization.h b/src/fpga/quantization.h
similarity index 80%
rename from src/fpga/fpga_quantilization.h
rename to src/fpga/quantization.h
index 4f1f6ad402a3ff4df773ecbd2121820f4c7dc265..0d6c2405fccd814f73d44eef20b6735dc0ad0eab 100644
--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/quantization.h
@@ -21,11 +21,10 @@ namespace paddle_mobile {
 namespace fpga {
 
 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width);
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width);
 
-// template <typename Dtype>
-void quantify_filter(framework::Tensor* filter);
+void quantize_filter(framework::Tensor* filter);
 
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 797fcf5bffbe5e738fe352d1ca84602f0e5d86a0..6fc16a01a2874f04ecea3edb89774f4deea93dd5 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -64,7 +64,8 @@ struct SizeOfTypeFunctor {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t,
+                    int8_t>
       functor;
   size_t size = functor(type);
@@ -115,8 +116,8 @@ class Tensor {
     PADDLE_MOBILE_ENFORCE(
        (std::is_same<T, void>::value ||
         holder_->type().hash_code() == typeid(T).hash_code()),
-        "Tensor holds the wrong type, it holds %s",
-        this->holder_->type().name());
+        "Tensor holds the wrong type, it holds %s, requested %s",
+        this->holder_->type().name(), typeid(T).name());
 
     return reinterpret_cast<T *>(
         reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -255,14 +256,26 @@ class Tensor {
 
 #ifdef PADDLE_MOBILE_FPGA
   struct FPGAArgs {
-    float scale;
+    friend class Tensor;
+
+    inline float *scale_pointer() { return scale_; }
+    inline float scale() { return *scale_; }
 
-    inline float *scale_pointer() { return &scale; }
+   private:
+    float *scale_;
   };
 
   struct FPGAArgs fpga_args() const {
-    return fpgaArgs_;
+    FPGAArgs args;
+    args.scale_ = scale.get();
+    return args;
   }
+
+  void SetFpgaScale(float s) { *(scale.get()) = s; }
+
+ private:
+  std::shared_ptr<float> scale = std::make_shared<float>(0);
+
 #endif
 
  private:
@@ -331,10 +344,6 @@ class Tensor {
    * begins.
    */
   size_t offset_;
-
-#ifdef PADDLE_MOBILE_FPGA
-  FPGAArgs fpgaArgs_;
-#endif
 };
 
 #ifdef PADDLE_MOBILE_DEBUG
@@ -342,9 +351,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
   printer << " dims: " << tensor.dims() << "\n";
   int stride = tensor.numel() / 20;
   stride = stride > 0 ? stride : 1;
+#ifndef PADDLE_MOBILE_FPGA
   for (int i = 0; i < tensor.numel(); i += stride) {
     printer << tensor.data<float>()[i] << " ";
   }
+#endif
+
   return printer;
 }
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 8902543347b2db7caee7126b2a28fa460ca741db..2bd4c0ac6ba3c7b066cc7ad2439ab6bebb7c3cd9 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 
 #ifdef PADDLE_MOBILE_FPGA
 
-#include "fpga/api/fpga_api.h"
+#include "fpga/api.h"
 
 #endif
 
@@ -26,7 +26,7 @@ namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
 
-#ifdef PADDLE_MOBILE_FPGA__VV
+#ifdef PADDLE_MOBILE_FPGA
 namespace fpga = paddle_mobile::fpga;
 
 void Copy(void *dst, const void *src, size_t num) {
diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h
index 286b0076ef2b9ad806f141c4d6124f1233dc78dc..5969e679552345d25c8c9c7a4950eb3b6d72eca2 100644
--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -38,10 +38,15 @@ class FeedOp : public framework::OperatorBase {
   }
 
 #ifdef PADDLE_MOBILE_FPGA
-  void RunImpl() const { fpga::PerformBypass(param_.FpgaArgs()); }
+  void Init() {
+    Tensor *output = param_.Out();
+    output->mutable_data<half>();
+  }
+
+  void RunImpl() const {
     const Tensor *input = param_.InputX();
-    auto input_ptr = (const_cast<Tensor *>(input))->mutable_data<float>();
+    auto input_ptr = input->data<float>();
     Tensor *output = param_.Out();
     auto output_ptr = output->mutable_data<half>();
     fpga::BypassArgs args;
     args.image.height = input->dims()[2];
     args.image.width = input->dims()[3];
     args.output.address = output_ptr;
-    param_.SetFpgaArgs(args);
+    fpga::PerformBypass(args);
   }
 
 #else
-  void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
   void Init() {}
+  void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
 #endif
 
  protected:
diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
index 93bbfe9c1a8ae3d9930c759ba0efcef04e5e572f..152b200cfa88d010bb4c8e8022c01ee3663cc179 100644
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -15,8 +15,8 @@ limitations under the License. */
 
 #ifdef FUSION_CONVADDBN_OP
 
 #include "operators/kernel/conv_add_bn_kernel.h"
-#include "fpga/api/fpga_api.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/api.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -37,11 +37,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0] &&
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
                             bias->dims()[0] == param->InputBias()->dims()[0],
-                        "Image channel should be equal to bias number");
+                        "Output channel should be equal to bias number");
 
-  const int channel = input->dims()[1];
+  const int channel = out->dims()[1];
   float *bs_ptr = reinterpret_cast<float *>(
       fpga::fpga_malloc(2 * channel * sizeof(float)));
   Tensor *new_scale = new Tensor();
@@ -60,8 +60,8 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
 
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
index d5e79a39b79494d543e6e9485497a540a15152aa..caa1e94c6bb9b583efb15e181d46c80f0b66c7ff 100644
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #ifdef FUSION_CONVADDBNRELU_OP
 
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -35,11 +35,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(FusionConvAddBNReluParam *param) {
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0] &&
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
                             bias->dims()[0] == param->InputBias()->dims()[0],
-                        "Image channel should be equal to bias number");
+                        "Output channel should be equal to bias number");
 
-  const int channel = input->dims()[1];
+  const int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   Tensor *new_scale = new Tensor();
   Tensor *new_bias = new Tensor();
@@ -56,8 +56,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(FusionConvAddBNReluParam *param) {
   }
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
index 3b44506f65cc6700323c3d5f7d0765c9e52f7e0a..33e55773ad0be4f174916f0e5f066b6eeec1d46e 100644
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #ifdef FUSION_CONVADDRELU_OP
 
 #include "operators/kernel/conv_add_relu_kernel.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -31,17 +31,17 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) {
   Tensor *out = param->Output();
   auto out_ptr = out->mutable_data<half>();
 
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0],
-                        "Image channel should be equal to bias number");
-  int channel = input->dims()[1];
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i * 2] = 1;
     bs_ptr[i * 2 + 1] = bias_ptr[i];
   }
 
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp
index fd95f47a1fcb8c444172909abc67ad7f5e0de632..3ad65a254f95bde431efbd3c5995df6cc2295d3d 100644
--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -15,8 +15,8 @@ limitations under the License. */
 
 #ifdef FUSION_CONVBN_OP
 
 #include "operators/kernel/conv_bn_kernel.h"
-#include "fpga/api/fpga_api.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/api.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -35,10 +35,10 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam *param) {
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == param->InputBias()->dims()[0],
-                        "Image channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
 
-  const int channel = input->dims()[1];
+  const int channel = out->dims()[1];
   float *bs_ptr = reinterpret_cast<float *>(
       fpga::fpga_malloc(2 * channel * sizeof(float)));
   Tensor *new_scale = new Tensor();
@@ -55,8 +55,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam *param) {
   }
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
index fbb3ca512ea863c49ca4da3f9a133f8c91897b53..18ef4b4e15e488f01a435d89218992e63873bb14 100644
--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #ifdef FUSION_CONVBNRELU_OP
 
 #include "operators/kernel/conv_bn_relu_kernel.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -33,10 +33,10 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam *param) {
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == param->InputBias()->dims()[0],
-                        "Image channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
 
-  const int channel = input->dims()[1];
+  const int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   Tensor *new_scale = new Tensor();
   Tensor *new_bias = new Tensor();
@@ -52,8 +52,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam *param) {
   }
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp
index 21e334b12b70be1980d9417ed11161143106d1c6..fb6a3e7508bf11f0bba1c3e34c065fa63caa2100 100644
--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef FUSION_FCRELU_OP
 #include "operators/kernel/fc_relu_kernel.h"
-#include "fpga/api/fpga_api.h"
+
+#include "fpga/api.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -23,8 +25,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam *param) {
   bool relu_enabled = true;
   const Tensor *input_x = param->InputX();
   auto input_x_ptr = input_x->data<half>();
-  const Tensor *input_y = param->InputY();
-  auto input_y_ptr = input_y->data<float>();
+  Tensor *input_y = param->InputY();
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   Tensor *out = param->Out();
@@ -32,13 +33,16 @@
   PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
                         "Image channel should be equal to weight number");
-  int channel = input_x->dims()[1];
+  int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i * 2] = 1;
     bs_ptr[i * 2 + 1] = input_z_ptr[i];
   }
 
+  fpga::quantize_filter(input_y);
+  auto input_y_ptr = input_y->data<int8_t>();
+
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
   convArgs.filter_address = (void *)input_y_ptr;
diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
index 505b8768565dc4003152c3493b558448f9d73d04..5479deb6c19cf085dcea03555e4895d4ad98c4e8 100644
--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #ifdef FUSION_FC_OP
 
 #include "operators/kernel/fusion_fc_kernel.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -23,8 +24,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam *param) {
   bool relu_enabled = false;
   const Tensor *input_x = param->InputX();
   auto input_x_ptr = input_x->data<half>();
-  const Tensor *input_y = param->InputY();
-  auto input_y_ptr = input_y->data<float>();
+  Tensor *input_y = param->InputY();
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   Tensor *out = param->Out();
@@ -32,13 +32,16 @@
   PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
                         "Image channel should be equal to weight number");
-  int channel = input_x->dims()[1];
+  int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i * 2] = 1;
     bs_ptr[i * 2 + 1] = input_z_ptr[i];
   }
 
+  fpga::quantize_filter(input_y);
+  auto input_y_ptr = input_y->data<int8_t>();
+
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
   convArgs.filter_address = (void *)input_y_ptr;
@@ -55,11 +58,9 @@
   convArgs.image.width = input_x->dims()[3];
   convArgs.image.pad_height = 0;
   convArgs.image.pad_width = 0;
-  convArgs.image.scale_address =
-      input_x->fpga_args().scale_pointer();  // fc input has scale attribute??
+  convArgs.image.scale_address = input_x->fpga_args().scale_pointer();
   convArgs.output.address = (void *)out_ptr;
-  convArgs.output.scale_address =
-      out->fpga_args().scale_pointer();  // fc output has scale attribute??
+  convArgs.output.scale_address = out->fpga_args().scale_pointer();
   param->SetFpgaArgs(convArgs);
   return true;
 }
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d58ab0f751eeb584f286a0920d08e9473be38402
--- /dev/null
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+
+#include "../softmax_kernel.h"
+#include "../central-arm-func/softmax_arm_func.h"
+#include "common/types.h"
+#include "fpga/api.h"
+#include "operators/math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
+  const Tensor *input = param->InputX();
+  if (input->type() == typeid(half)) {
+    auto input_ptr = input->data<half>();
+    auto output_ptr = param->Out();
+    fpga::BypassArgs args;
+    args.convert_type = fpga::DATA_FP16_TO_FP32;
+    args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
+    args.image.address = (void *)(input_ptr);
+    args.image.height = input->dims()[0];
+    args.image.width = input->dims()[1];
+    args.image.channels = 1;
+    args.output.address = output_ptr;
+    param->SetFpgaArgs(args);
+  }
+
+  return true;
+}
+
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam &param) const {
+  // SoftmaxCompute(param);
+}
+
+template class SoftmaxKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index f61599ab51e6a06c26dd188d5a1b33aa8b1df200..06da537e419f3a54ffc9986b12274f9853f12774 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "framework/tensor.h"
 #include "framework/variable.h"
 #ifdef PADDLE_MOBILE_FPGA
-#include "fpga/api/fpga_api.h"
+#include "fpga/api.h"
 #endif
 
 namespace paddle_mobile {
@@ -585,6 +585,21 @@ class SoftmaxParam : public OpParam {
  private:
   Tensor *input_x_;
   Tensor *out_;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  std::shared_ptr<Tensor> float_input_x_;
+  fpga::BypassArgs fpga_bypass_args;
+
+ public:
+  Tensor *FloatInput() {
+    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
+  }
+  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#endif
 };
 #endif
 
@@ -670,16 +685,6 @@ class FeedParam : public OpParam {
   Tensor *input_x_;
   Tensor *out_;
   int batch_size;
-
-#ifdef PADDLE_MOBILE_FPGA
-
- private:
-  fpga::BypassArgs fpga_bypass_args;
-
- public:
-  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
-  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
-#endif
 };
 
 class FetchParam : public OpParam {
@@ -1143,7 +1148,6 @@ class FusionConvBNParam : public OpParam {
   FusionConvBNParam(const VariableNameMap &inputs,
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     const Scope &scope) {
-    axis_ = GetAttr<int>("axis", attrs);
     filter_ = FilterFrom<LoDTensor>(inputs, scope);
     input_ = InputFrom<LoDTensor>(inputs, scope);
     output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
@@ -1160,8 +1164,6 @@
     // is_test_ = GetAttr("is_test", attrs);
   }
 
-  const int &Axis() const { return axis_; }
-
   const Tensor *Input() const { return input_; }
 
 #ifdef PADDLE_MOBILE_FPGA
@@ -1202,7 +1204,6 @@ class FusionConvBNParam : public OpParam {
   const Tensor *NewBias() const { return new_bias_; }
 
  protected:
-  int axis_;
   Tensor *input_;
   Tensor *output_y_;
   Tensor *filter_;
diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp
index c9edfccf4ff08e5a12d735526c3d63c689711357..e85edc69c3291c794f2eeb8119b91b2926c4d870 100644
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -34,6 +34,7 @@ REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif
 
 #endif
diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h
index f645d7edf7a3b9f7a92cf286feec58e960a5e3b7..bacae23b522daf1cc689a2d7af6b14cd2bc794bb 100644
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -55,6 +55,7 @@ USE_OP_CPU(softmax);
 USE_OP_MALI_GPU(softmax);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(softmax);
 #endif
 
 #endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 632f1f55c24c524ee56a15e91940517fc44af06c..8f92b6dab9e5c2c51c485f61fa2860926ce50b1f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -27,10 +27,14 @@ elseif("resnet" IN_LIST NET)
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
 elseif("FPGAnets" IN_LIST NET)
-    # ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
-    # target_link_libraries(test-resnet paddle-mobile)
+    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet paddle-mobile)
+
     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-tensor-quant paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-fpga-concat-op paddle-mobile)
 elseif("mobilenetssd" IN_LIST NET)
     # gen test
     ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
diff --git a/test/fpga/test_concat_op.cpp b/test/fpga/test_concat_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d1a5828b36b3d9ed371a271af6db82657ff1596
--- /dev/null
+++ b/test/fpga/test_concat_op.cpp
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/concat_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::FPGA> loader;
+  auto program = loader.Load(g_googlenet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::FPGA,
+                paddle_mobile::operators::ConcatOp<paddle_mobile::FPGA, float>>
+      executor(program, "concat");
+
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
+  input_tensors.push_back(input1);
+  Tensor input2;
+  auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
+  input_tensors.push_back(input2);
+  Tensor input3;
+  auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
+  input_tensors.push_back(input3);
+  Tensor input4;
+  auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
+  input_tensors.push_back(input4);
+  // 2. input_names
+  vector<string> input_names({
+      "conv2d_3.tmp_1",
+      "conv2d_5.tmp_1",
+      "conv2d_7.tmp_1",
+      "conv2d_8.tmp_1",
+  });
+
+  // 3. output_names
+  vector<string> output_names({"concat_0.tmp_0"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();
+
+  // 5. test one example.
+  int input_n = 1;
+  int input_c = 2;
+  int input_h = 0;
+  int input_w = 1;
+  int stride0 = input3.numel() / input3.dims()[0];
+  int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
+  int stride2 = input3.dims()[3];
+  /// inputx1 (4,10,2,2),
+  /// inputx2 (4,20,2,2),
+  /// inputx3 (4,30,2,2),
+  /// inputx4 (4,40,2,2),
+  /// axis = 1
+  /// output (4,100,2,2)
+  int input_index =
+      input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
+  int output_index = input_n * 100 * 2 * 2 +
+                     (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
+                     input_h * 2 + input_w;
+
+  DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
+  DLOG << " output [1,32,0,1] = " << output0_data[output_index];
+  return 0;
+}
diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp
index 73ac88ef77b0c02545ef55b6493d4681c61c192d..82fdc22763d11d4b06439465d56d0e6fa663a317 100644
--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
@@ -17,7 +17,13 @@ limitations under the License. */
 #include "../test_include.h"
 
 int main() {
+#ifdef PADDLE_MOBILE_FPGA
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+#endif
+
+#ifdef PADDLE_MOBILE_CPU
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+#endif
   paddle_mobile.SetThreadNum(4);
   auto time1 = time();
   if (paddle_mobile.Load(g_resnet, true)) {
diff --git a/tools/op.cmake b/tools/op.cmake
index 8f5ffb52aeae29c76d0d456a1392b5411cb5d04a..fd2b103842a3017fa5c93d39602a4c2bee47d94e 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -86,6 +86,8 @@ if ("resnet" IN_LIST NET)
     set(RELU_OP ON)
     set(ELEMENTWISEADD_OP ON)
     set(POOL_OP ON)
+    set(BATCHNORM_OP ON)
+    set(MUL_OP ON)
     set(RESHAPE_OP ON)
     set(SOFTMAX_OP ON)