diff --git a/README.md b/README.md
index c9d15d4960a6330ff6614b6dfc8fd20b81386c9c..59ef597dd749ea16658977cd6d548cedaa90d166 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms.

 - **ARM CPU**

-![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_18.png)
+![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_29.png)

 The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning needs a large amount of hand-written CPU assembly, and we are coding intensively to exploit every bit of acceleration the hardware offers.
 ARM CPU optimization is still in progress; so far only conventional CPU optimizations are in place. On an ARM Cortex-A73, paddle-mobile arm-v7 currently runs one MobileNet 1.0 inference in 110+ ms on a single core. That is clearly not our final goal: we are rewriting the hot paths in assembly, so there is still large headroom. Only armv7 is supported at the moment; armv8 support will follow.
diff --git a/src/common/types.cpp b/src/common/types.cpp
index 14924c4a2129292aca32e307569fc8dc9a00f913..2f366eb9e5a10ea11e3153e6e32b18204c6dd9cd 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -17,39 +17,39 @@ limitations under the License. */

 namespace paddle_mobile {

-const std::string G_OP_TYPE_CONV = "conv2d";
-const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
-const std::string G_OP_TYPE_BOX_CODER = "box_coder";
-const std::string G_OP_TYPE_CONCAT = "concat";
-const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
-const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
-const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
-const std::string G_OP_TYPE_FC = "fusion_fc";
-const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
-const std::string G_OP_TYPE_LRN = "lrn";
-const std::string G_OP_TYPE_MUL = "mul";
-const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
-const std::string G_OP_TYPE_POOL2D = "pool2d";
-const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
-const std::string G_OP_TYPE_RELU = "relu";
-const std::string G_OP_TYPE_RESHAPE = "reshape";
-const std::string G_OP_TYPE_SIGMOID = "sigmoid";
-const std::string G_OP_TYPE_SOFTMAX = "softmax";
-const std::string G_OP_TYPE_TRANSPOSE = "transpose";
-const std::string G_OP_TYPE_SPLIT = "split";
-const std::string G_OP_TYPE_FEED = "feed";
-const std::string G_OP_TYPE_FETCH = "fetch";
-const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
-const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
-const std::string G_OP_TYPE_DROPOUT = "dropout";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
-const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
-const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
+const char *G_OP_TYPE_CONV = "conv2d";
+const char *G_OP_TYPE_BATCHNORM = "batch_norm";
+const char *G_OP_TYPE_BOX_CODER = "box_coder";
+const char *G_OP_TYPE_CONCAT = "concat";
+const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
+const char *G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
+const char *G_OP_TYPE_LRN = "lrn";
+const char *G_OP_TYPE_MUL = "mul";
+const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POOL2D = "pool2d";
+const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
+const char *G_OP_TYPE_RELU = "relu";
+const char *G_OP_TYPE_RESHAPE = "reshape";
+const char *G_OP_TYPE_SIGMOID = "sigmoid";
+const char *G_OP_TYPE_SOFTMAX = "softmax";
+const char *G_OP_TYPE_TRANSPOSE = "transpose";
+const char *G_OP_TYPE_SPLIT = "split";
+const char *G_OP_TYPE_FEED = "feed";
+const char *G_OP_TYPE_FETCH = "fetch";
+const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const char *G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
+const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
     "fusion_elementwise_add_relu";
-const std::string G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
-const std::string G_OP_TYPE_REGION = "region";
+const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
+const char *G_OP_TYPE_REGION = "region";

 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
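A note on the `types.cpp` change above: turning the `extern const std::string` op-type globals into `const char *` drops one static `std::string` constructor per symbol (and the associated static-initialization-order hazards) without disturbing callers, because C strings still convert and compare against `std::string` transparently. A minimal sketch, illustrative only and not part of the patch:

```cpp
#include <string>
#include <unordered_map>

// Same shape as the patched globals in src/common/types.cpp.
const char *G_OP_TYPE_CONV = "conv2d";

int main() {
  // std::string-keyed containers accept the C string as a key directly;
  // the key is built via std::string's const char * constructor.
  std::unordered_map<std::string, int> op_count;
  ++op_count[G_OP_TYPE_CONV];

  // Comparisons against std::string also keep working unchanged.
  std::string type = "conv2d";
  return (type == G_OP_TYPE_CONV) ? 0 : 1;  // exits 0: the strings match
}
```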
diff --git a/src/common/types.h b/src/common/types.h
index ae993f8034d7136a4badac2bbaf0353c6ef05222..6066879305d5ea7d1b6dcb0bb618c234338cc171 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -20,7 +20,9 @@ limitations under the License. */

 #include

 namespace paddle_mobile {
-enum class Precision : int { FP32 = 0 };
+enum class Precision : int { FP32 = 0, FP16 = 1 };
+
+typedef int16_t half;

 template <Precision p>
 struct PrecisionTrait {
@@ -31,6 +33,10 @@ template <>
 struct PrecisionTrait<Precision::FP32> {
   typedef float ptype;
 };
+template <>
+struct PrecisionTrait<Precision::FP16> {
+  typedef half ptype;
+};

 //! device type
 enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
@@ -73,40 +79,40 @@ enum PMStatus {
   PMWrongDevice = 0x08 /*!< un-correct device. */
 };

-extern const std::string G_OP_TYPE_CONV;
-extern const std::string G_OP_TYPE_BATCHNORM;
-extern const std::string G_OP_TYPE_BOX_CODER;
-extern const std::string G_OP_TYPE_CONCAT;
-extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
-extern const std::string G_OP_TYPE_FC;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
-
-extern const std::string G_OP_TYPE_LRN;
-extern const std::string G_OP_TYPE_MUL;
-extern const std::string G_OP_TYPE_MULTICLASS_NMS;
-extern const std::string G_OP_TYPE_POOL2D;
-extern const std::string G_OP_TYPE_PRIOR_BOX;
-extern const std::string G_OP_TYPE_RELU;
-extern const std::string G_OP_TYPE_RESHAPE;
-extern const std::string G_OP_TYPE_SIGMOID;
-extern const std::string G_OP_TYPE_SOFTMAX;
-extern const std::string G_OP_TYPE_TRANSPOSE;
-extern const std::string G_OP_TYPE_SPLIT;
-extern const std::string G_OP_TYPE_FEED;
-extern const std::string G_OP_TYPE_FETCH;
-extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
-extern const std::string G_OP_TYPE_IM2SEQUENCE;
-extern const std::string G_OP_TYPE_DROPOUT;
-
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN;
-extern const std::string G_OP_TYPE_FUSION_POOL_BN;
-extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
-extern const std::string G_OP_TYPE_FUSION_FC_RELU;
-extern const std::string G_OP_TYPE_REGION;
+extern const char *G_OP_TYPE_CONV;
+extern const char *G_OP_TYPE_BATCHNORM;
+extern const char *G_OP_TYPE_BOX_CODER;
+extern const char *G_OP_TYPE_CONCAT;
+extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
+extern const char *G_OP_TYPE_FC;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
+
+extern const char *G_OP_TYPE_LRN;
+extern const char *G_OP_TYPE_MUL;
+extern const char *G_OP_TYPE_MULTICLASS_NMS;
+extern const char *G_OP_TYPE_POOL2D;
+extern const char *G_OP_TYPE_PRIOR_BOX;
+extern const char *G_OP_TYPE_RELU;
+extern const char *G_OP_TYPE_RESHAPE;
+extern const char *G_OP_TYPE_SIGMOID;
+extern const char *G_OP_TYPE_SOFTMAX;
+extern const char *G_OP_TYPE_TRANSPOSE;
+extern const char *G_OP_TYPE_SPLIT;
+extern const char *G_OP_TYPE_FEED;
+extern const char *G_OP_TYPE_FETCH;
+extern const char *G_OP_TYPE_DEPTHWISE_CONV;
+extern const char *G_OP_TYPE_IM2SEQUENCE;
+extern const char *G_OP_TYPE_DROPOUT;
+
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_POOL_BN;
+extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_FC_RELU;
+extern const char *G_OP_TYPE_REGION;

 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
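With `Precision::FP16` and the `half` stand-in added above, code can pick its element type from `PrecisionTrait` at compile time. A self-contained sketch of the intended usage follows; the `TypedBuffer` consumer is hypothetical (not part of the patch), and note that `half` is only a 16-bit storage pattern, so host code can move FP16 data around but not do arithmetic on it directly:

```cpp
#include <cstddef>
#include <cstdint>

// Mirrors the patched declarations in src/common/types.h.
enum class Precision : int { FP32 = 0, FP16 = 1 };
typedef int16_t half;

template <Precision p>
struct PrecisionTrait {};

template <>
struct PrecisionTrait<Precision::FP32> {
  typedef float ptype;
};
template <>
struct PrecisionTrait<Precision::FP16> {
  typedef half ptype;
};

// Hypothetical consumer: a buffer whose element type follows the precision.
template <Precision P>
struct TypedBuffer {
  typedef typename PrecisionTrait<P>::ptype value_type;  // float or half
  value_type *data = nullptr;
  size_t numel = 0;
  size_t bytes() const { return numel * sizeof(value_type); }
};

static_assert(sizeof(TypedBuffer<Precision::FP16>::value_type) == 2,
              "FP16 elements occupy two bytes");
```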
diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp
index d484d889d8df8f4171658ae395531b84b0ac0a0d..706614e9449eb33e70e261b221d8882a9353ddbf 100644
--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
@@ -29,15 +29,15 @@ limitations under the License. */

 #include "fpga/api/fpga_api.h"

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
-namespace api {

 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";

-static inline int do_ioctl(int req, void *arg) { return ioctl(req, arg); }
+static inline int do_ioctl(int req, void *arg) {
+  return ioctl(req, (long unsigned int)arg);
+}

 int open_device() {
   if (fd == -1) {
@@ -48,8 +48,8 @@ int open_device() {

 // memory management;
 void *fpga_malloc(size_t size) {
-  return reinterpret_cast<(void *)> mmap64(NULL, size, PROT_READ | PROT_WRITE,
-                                           MAP_SHARED, fd, 0);
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
 }

 void fpga_free(void *ptr) { munmap(ptr, 0); }
@@ -58,7 +58,9 @@ void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
 }

-}  // namespace api
+int ComputeFpgaConv(struct FpgaConvArgs args) { return 0; }
+int ComputeFpgaPool(struct FpgaPoolArgs args) { return 0; }
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args) { return 0; }
+
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
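Since `fpga_malloc` now returns an `mmap64`-backed mapping of driver memory, buffers obtained from it must go back through `fpga_free` (which unmaps) rather than `free()`. A usage sketch under these assumptions; `stage_input` is a hypothetical helper, and `open_device()` must have opened `/dev/fpgadrv0` before any mapping is requested:

```cpp
#include <cstddef>

#include "fpga/api/fpga_api.h"

// Hypothetical helper: stage `len` floats into driver-visible memory.
void *stage_input(const float *src, size_t len) {
  using namespace paddle_mobile::fpga;

  open_device();                                 // opens /dev/fpgadrv0 once
  void *dst = fpga_malloc(len * sizeof(float));  // mmap64-backed mapping
  fpga_copy(dst, src, len * sizeof(float));      // plain memcpy underneath
  return dst;  // pair with fpga_free(dst), not free()
}
```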
*/

#pragma once

+#include <stdint.h>
 #include
 #include
 #include

// memory management;

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
-namespace api {

 int open_device();
 int close_device();

-void *fpga_malloc(size_t size);
-void fpga_free(void *ptr);
-void fpga_copy(void *dst, const void *src, size_t num);
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dst, const void* src, size_t num);

-struct CnnVersionArgs {
-  void *buf;
+struct FpgaVersionArgs {
+  void* buf;
 };

-struct QuantArgs {
+struct MemoryToPhysicalArgs {
+  const void* src;
+  uint64_t physical;
+};
+
+struct MemoryCopyArgs {
+  void* src;
+  void* dst;
+  size_t size;
+};
+
+struct FpgaQuantArgs {
   float scale;
 };

-struct BatchNormalizationArgs {
-  bool enable;
+struct FpgaBNArgs {
+  bool enabled = false;
+  void* bias_addr;
+  void* scale_addr;
+};
+
+struct FpgaKernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_h;
+  uint32_t stride_w;
+};
+
+struct FpgaImageArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t channels;
+  uint32_t pad_h;
+  uint32_t pad_w;
+};
+
+struct FpgaConvArgs {
+  bool relu_enabled;
+  struct FpgaBNArgs BNargs;
+  void* image_addr;
+  void* filter_addr;
+  void* bias_addr;
+  void* output_addr;
+  float quant_scale;
+  struct FpgaImageArgs image;
+  uint32_t filter_num;
+  uint32_t group_num;
+
+  struct FpgaKernelArgs kernel;
+};
+
+struct FpgaPoolArgs {
+  void* image_addr;
+  void* output_addr;
+  struct FpgaImageArgs image;
+  struct FpgaKernelArgs kernel;
+};
+
+struct FpgaEWAddArgs {
+  bool relu_enabled;
+  void* image0_addr;
+  void* image1_addr;
+  void* result_addr;
+  uint32_t const0;
+  uint32_t const1;
+  uint32_t data_len;  // aligned element count
 };

-struct ScaleArgs {};
+int ComputeFpgaConv(struct FpgaConvArgs args);
+int ComputeFpgaPool(struct FpgaPoolArgs args);
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);

-#define IOCTL_CNN_MAGIC 'CNN'
-#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
-#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
-#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
+#define IOCTL_FPGA_MAGIC 'CNN'
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
+#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
+#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
+#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)

-}  // namespace api
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
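The new `_IOW` codes give one request per accelerator operation, so user space follows the usual ioctl driver protocol: open the device node, fill in the args struct, issue the request. A hedged sketch of one such call (the patch's own `do_ioctl` helper is file-local to `fpga_api.cpp`, so this uses `ioctl` directly; error handling kept minimal):

```cpp
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "fpga/api/fpga_api.h"

// Sketch: ask the driver to run one pooling pass described by `args`.
int run_fpga_pool(paddle_mobile::fpga::FpgaPoolArgs args) {
  int fd = open("/dev/fpgadrv0", O_RDWR);  // same node fpga_api.cpp opens
  if (fd < 0) return -1;

  // IOCTL_CONFIG_POOLING is _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs):
  // the kernel side copies `args` in and kicks off the pooling unit.
  int ret = ioctl(fd, IOCTL_CONFIG_POOLING, &args);

  close(fd);
  return ret;
}
```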
diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h
index f16a65c28fb47e1cf4139588742ebe1073c3f3e6..a5890d34c600f6c4f4838ec94c202801b3044d3f 100644
--- a/src/framework/program/program-optimize/fusion_op_register.h
+++ b/src/framework/program/program-optimize/fusion_op_register.h
@@ -14,11 +14,13 @@ limitations under the License. */

 #pragma once

+#include
 #include
 #include
+#include

 #include "framework/operator.h"
-#include "node.h"
+#include "framework/program/program-optimize/node.h"

 namespace paddle_mobile {
 namespace framework {
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 56e6d6bf18740489c195a66db70331cbab42aeea..954a65a3605c4d0204890d9414aeb074371b0d69 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -16,14 +16,15 @@ limitations under the License. */

 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include

-#include "common/enforce.h"
-#include
 #include "common/enforce.h"
+#include "common/types.h"
 #include "framework/data_layout.h"
 #include "framework/ddim.h"
 #include "memory/t_malloc.h"
@@ -63,7 +64,8 @@ struct SizeOfTypeFunctor {
 };

 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+      functor;
   size_t size = functor(type);
   PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index 65f019d1e3c3f6f6bdb8a18a9ff99bb7ecb2012c..d6434b64aa752fd62bc637a882298228d59880b8 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -187,7 +187,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
       memcpy(&max_value, *data + sizeof(float), sizeof(float));
       *data += 2 * sizeof(float);
       const float factor = (max_value - min_value) / 255.0;
-      uint8_t *uint8_data = (uint8_t *)(*data);
+      uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
       for (int k = 0; k < memory_size; ++k) {
         static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
       }
@@ -419,7 +419,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
 }

 template class Executor<CPU, Precision::FP32>;
-template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
+template class Executor<FPGA, Precision::FP32>;

 }  // namespace paddle_mobile
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 178541953323b6ffd1a3339f8209c2839b37a784..42b8c4551871c58955251d94845ca13576d7735b 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -27,17 +27,17 @@ namespace memory {
 const int MALLOC_ALIGN = 64;

 #ifdef PADDLE_MOBILE_FPGA
-namespace api = paddle::mobile::fpga::api;
+namespace fpga = paddle_mobile::fpga;

 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }

-void *Alloc(size_t size) { return api::malloc(size); }
+void *Alloc(size_t size) { return fpga::fpga_malloc(size); }

 void Free(void *ptr) {
   if (ptr) {
-    api::fpga_free(ptr);
+    fpga::fpga_free(ptr);
   }
 }
diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp
index 19d771ddd5884412624a0720368ecc80f92678ea..f767f3481c999a16da46e75e314e8ebcb54193fa 100644
--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -14,7 +14,9 @@ limitations under the License.
*/ #ifdef CONCAT_OP -#include "concat_op.h" +#include + +#include "operators/concat_op.h" namespace paddle_mobile { namespace operators { @@ -68,6 +70,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp); REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp); #endif #endif diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index 7aedaab4b1fa00707661ada428c7c1dc27f124cd..bad0015917c2a9d4016def26c8a332d076b39c99 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -53,6 +53,7 @@ USE_OP_CPU(concat); USE_OP_MALI_GPU(concat); #endif #ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(concat); #endif #endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 723747874da8fc8ee2c02eb1be4c89189c2af746..e45ad38fd68cb9b4616b7e363be117e2039c93a9 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -29,7 +29,7 @@ class FeedOp : public framework::OperatorBase { std::shared_ptr scope) : framework::OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} + param_(inputs, outputs, attrs, scope.get()) {} void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } void Init() {} diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp index af16048a1b4eba2ff36f842b6cf968031989576e..db942b018d7085ca3986533937328101afb08ff9 100644 --- a/src/operators/kernel/arm/dropout_kernel.cpp +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #ifdef DROPOUT_OP -#pragma once - #include "operators/kernel/dropout_kernel.h" #include diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h index 5a3783971959db8fba9ca6b701fb6eb6340fcb3f..3ef6b9dd62d88f012eba3456c676ac0d33bf9e52 100644 --- a/src/operators/kernel/dropout_kernel.h +++ b/src/operators/kernel/dropout_kernel.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c691988f4a388c7835a7016602d7a1ac9cb5f9b6 --- /dev/null +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONCAT_OP + +#include "operators/kernel/concat_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConcatKernel::Init(ConcatParam *param) { + return true; +} + +template <> +void ConcatKernel::Compute(const ConcatParam ¶m) const { + auto inputs = param.Inputs(); + auto *out = param.Out(); + int64_t axis = param.Axis(); + out->mutable_data(); + + DDim out_dim = out->dims(); + int pixels = out_dim[1] * out_dim[2]; + auto out_channel = out_dim[3]; + + auto out_offset = 0; + + for (int i = 0; i < inputs.size(); ++i) { + auto input = inputs[i]; + auto channels = input[3]; + out_offset += channels; + auto src = input->data(); + for (int j = 0; j < pixels; ++j) { + auto dst = out->data() + out_offset; + memory::Copy(dst, src, sizeof(half)); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h index cb592613f73d90dae5a7d6e515f8bc091981776e..aa798fd6af5592a062de207714dc9fee2afb93df 100644 --- a/src/operators/kernel/im2sequence_kernel.h +++ b/src/operators/kernel/im2sequence_kernel.h @@ -20,13 +20,11 @@ limitations under the License. */ #include "operators/math/vol2col.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { -using namespace framework; - template class Im2SequenceKernel : public framework::OpKernelBase { diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp index a76c3c46012a758a05cf8f846a15376ad1b9f33c..44a7ce2af62a1d27aff8181f6742bebda1d6d066 100755 --- a/src/operators/kernel/mali/fushion_fc_kernel.cpp +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #ifdef FUSION_FC_OP -#pragma once - #include "operators/kernel/fusion_fc_kernel.h" namespace paddle_mobile { diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h index 9f5dcb23ee9bf44ffa8bbdd98879d533d07c39f9..15696174377f04ad9a62366e03ded1f2cdcdee9e 100644 --- a/src/operators/kernel/prelu_kernel.h +++ b/src/operators/kernel/prelu_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h index cc76a4b59b0be24dd3a3fb82c0e3d9fb1a4dbf24..98ac71d0bbad86f595171ad7ac5b2a1cdf5908fa 100644 --- a/src/operators/kernel/scale_kernel.h +++ b/src/operators/kernel/scale_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h index e308364602f401b1c6c6f8e2e35385aefa055360..fd3b8dc767076c5244509f6015c42bee87df100b 100644 --- a/src/operators/kernel/slice_kernel.h +++ b/src/operators/kernel/slice_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 4966ca14594cfe4680b4de2f7f56ef85e345e437..b9b61f4d1c59a0e2c8e7822742c54472ad540981 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -92,8 +92,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, */ // 将A矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, - float *buffer) { +void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { const float *a0, *a1, *a2, *a3; for (int i = 0; i < m - m_tail; i += MR) { a0 = A + i * lda; @@ -131,9 +131,62 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, } } +void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const float *a0, *a1, *a2, *a3, *a4, *a5; + for (int i = 0; i < m - m_tail; i += MR) { + a0 = A + i * lda; + a1 = A + (i + 1) * lda; + a2 = A + (i + 2) * lda; + a3 = A + (i + 3) * lda; + a4 = A + (i + 4) * lda; + a5 = A + (i + 5) * lda; + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + *buffer++ = *a4++; + *buffer++ = *a5++; + } + } + int i = m - m_tail; + a0 = &A(i, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; + a4 = a0 + 4 * lda; + a5 = a0 + 5 * lda; + if (m_tail != 0) { + if (m_tail <= 5) { + a5 = zero; + } + if (m_tail <= 4) { + a4 = zero; + } + if (m_tail <= 3) { + a3 = zero; + } + if (m_tail <= 2) { + a2 = zero; + } + if (m_tail <= 1) { + a1 = zero; + } + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + *buffer++ = *a4++; + *buffer++ = *a5++; + } + } +} + // 将B矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { const float *b0; for (int j = 0; j < n - n_tail; j += NR) { for (int i = 0; i < k; ++i) { @@ -188,7 +241,8 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); } } @@ -218,7 +272,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); } } @@ -1868,22 +1923,22 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 30 * 1024; - int L2 = 1 * 1024 * 1024; + int L1 = 32 * 1024; + int L2 = 0.5 * 1024 * 1024; KC = k; - MC = L2 / (2 * KC * sizeof(float)); - NC = MC; + MC = L1 / (KC * sizeof(float)); + NC = L2 / (KC * sizeof(float)); - // make sure MC is multiple of 4, and NC 
is multiple of 8 + // make sure MC is multiple of MR, and NC is multiple of NR int mblock_num = (m + MC - 1) / MC; MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + 4 - 1) / 4 * 4; + MC = (MC + MR - 1) / MR * MR; // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; int nblock_num = (n + NC - 1) / NC; NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + 8 - 1) / 8 * 8; + NC = (NC + NR - 1) / NR * NR; // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; packedA = static_cast( @@ -1901,10 +1956,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, int mc, nc; for (int j = 0; j < n; j += NC) { nc = s_min(n - j, NC); - PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); for (int i = 0; i < m; i += MC) { mc = s_min(m - i, MC); - PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, relu); } @@ -1921,22 +1976,22 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, bool relu, float *new_scale, float *new_bias) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 30 * 1024; - int L2 = 1 * 1024 * 1024; + int L1 = 32 * 1024; + int L2 = 0.5 * 1024 * 1024; KC = k; - MC = L2 / (2 * KC * sizeof(float)); - NC = MC; + MC = L1 / (KC * sizeof(float)); + NC = L2 / (KC * sizeof(float)); - // make sure MC is multiple of 4, and NC is multiple of 8 + // make sure MC is multiple of MR, and NC is multiple of NR int mblock_num = (m + MC - 1) / MC; MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + 4 - 1) / 4 * 4; + MC = (MC + MR - 1) / MR * MR; // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; int nblock_num = (n + NC - 1) / NC; NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + 8 - 1) / 8 * 8; + NC = (NC + NR - 1) / NR * NR; // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; packedA = static_cast( @@ -1954,10 +2009,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, int mc, nc; for (int j = 0; j < n; j += NC) { nc = s_min(n - j, NC); - PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); for (int i = 0; i < m; i += MC) { mc = s_min(m - i, MC); - PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, relu, new_scale + i, new_bias + i); } @@ -1969,6 +2024,221 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, paddle_mobile::memory::Free(zero); } +void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { +#if __ARM_NEON +#if __aarch64__ + + // init C + float32x4_t cv0 = vdupq_n_f32(0.0); + float32x4_t cv1 = vdupq_n_f32(0.0); + float32x4_t cv2 = vdupq_n_f32(0.0); + float32x4_t cv3 = vdupq_n_f32(0.0); + float32x4_t cv4 = vdupq_n_f32(0.0); + float32x4_t cv5 = vdupq_n_f32(0.0); + float32x4_t cv6 = vdupq_n_f32(0.0); + float32x4_t cv7 = vdupq_n_f32(0.0); + float32x4_t cv8 = vdupq_n_f32(0.0); + float32x4_t cv9 = vdupq_n_f32(0.0); + float32x4_t cv10 = vdupq_n_f32(0.0); + float32x4_t cv11 = vdupq_n_f32(0.0); + + float32x4_t av; + float32x4_t bv0; + float32x4_t bv1; + + float32x2_t av01; + float32x2_t av23; + float32x2_t av45; + + for (int p = 0; p < k; p += 1) { + av = 
vld1q_f32(a); + av01 = vget_low_f32(av); + av23 = vget_high_f32(av); + av45 = vld1_f32(a + 4); + bv0 = vld1q_f32(b); + bv1 = vld1q_f32(b + 4); + + cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0); + cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1); + cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1); + + cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0); + cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); + cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); + cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); + + cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0); + cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0); + cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1); + cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1); + + a += MR; + b += NR; + } + + vst1q_f32(c, cv0); + vst1q_f32(c + 4, cv1); + vst1q_f32(c + ldc, cv2); + vst1q_f32(c + ldc + 4, cv3); + vst1q_f32(c + 2 * ldc, cv4); + vst1q_f32(c + 2 * ldc + 4, cv5); + vst1q_f32(c + 3 * ldc, cv6); + vst1q_f32(c + 3 * ldc + 4, cv7); + vst1q_f32(c + 4 * ldc, cv8); + vst1q_f32(c + 4 * ldc + 4, cv9); + vst1q_f32(c + 5 * ldc, cv10); + vst1q_f32(c + 5 * ldc + 4, cv11); + +#else + + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vmov.f32 q4, #0.0 \n\t" + "vmov.f32 q5, #0.0 \n\t" + "vmov.f32 q6, #0.0 \n\t" + "vmov.f32 q7, #0.0 \n\t" + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + // "pld [%[a_ptr], #128] \n\t" + // "pld [%[b_ptr], #128] \n\t" + // "pld [%[a_ptr], #192] \n\t" + // "pld [%[b_ptr], #192] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q4, q5}, [r5], r6 \n\t" + "vst1.32 {q6, q7}, [r5], r6 \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" + + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + +#endif // __aarch64__ +#else + +#endif // __ARM_NEON +} + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index d8b305a7282b871d61ed588b1237f4f8f1cb56f8..2044c264ed1c0f8624690874ed248661a753804c 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -19,7 +19,7 @@ limitations under the License. */ #define B(i, j) B[(i)*ldb + (j)] #define C(i, j) C[(i)*ldc + (j)] -#define MR 4 +#define MR 6 #define NR 8 #define s_min(i, j) ((i) < (j) ? (i) : (j)) @@ -39,12 +39,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, */ // 将 A 矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, - float *buffer); +void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); +void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); // 将 B 矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); +void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); // 分块矩阵乘法 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, @@ -67,6 +69,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, // 计算一个更小的 C 矩阵分块 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); +void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); // 分块矩阵乘法结果回写 // C = A * B diff --git a/src/operators/op_param.h b/src/operators/op_param.h index e2795b3aefe3c67df9b51c882298a717a388ae15..dc790ee348e4070bd2891debe21c6f8e7bcdcdf0 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -22,6 +22,9 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" +#ifdef PADDLE_MOBILE_FPGA +#include "fpga/api/fpga_api.h" +#endif namespace paddle_mobile { namespace operators { @@ -256,6 +259,15 @@ class ElementwiseAddParam : OpParam { Tensor *input_y_; Tensor *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaEWAddArgs fpga_EW_add_args; + + public: + const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } + void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) { fpga_EW_add_args = args; } +#endif }; #ifdef FUSION_ELEMENTWISEADDRELU_OP @@ -450,80 +462,15 @@ class PoolParam : public OpParam { vector paddings_; bool ceil_mode_; bool global_pooling_ = false; -}; -#endif - -#ifdef FUSION_POOLBN_OP -class FusionPoolBNParam : OpParam { - public: - FusionPoolBNParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_ = InputXFrom(inputs, scope); - pooling_type_ = GetAttr("pooling_type", attrs); - ksize_ = GetAttr>("ksize", attrs); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - ceil_mode_ = GetAttr("ceil_mode", attrs); - global_pooling_ = GetAttr("global_pooling", attrs); - output_y_ = OutputYFrom(outputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - // is_test_ = GetAttr("is_test", attrs); - } - const Tensor *Input() const { return input_; } - - const string &PoolingType() const { return pooling_type_; } - - const vector &Ksize() const { return ksize_; } - - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - bool isCeilMode() const { return ceil_mode_; } - - bool isGlobalPooling() const { return global_pooling_; } - - Tensor *OutputY() const { return output_y_; } - - const Tensor *InputBias() const { return input_bias_; } - - const Tensor *InputMean() const { return input_mean_; } - - const Tensor *InputScale() const { return input_scale_; } - - const Tensor *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - const bool &IsTest() const { return is_test_; } - - const string &DataFormat() const { return data_format_; } +#ifdef PADDLE_MOBILE_FPGA private: - Tensor *input_; - string pooling_type_; - vector ksize_; - vector strides_; - vector paddings_; - bool ceil_mode_; - bool global_pooling_ = false; - Tensor *output_y_; - Tensor *input_bias_; - Tensor *input_mean_; - Tensor *input_scale_; - Tensor *input_variance_; - float epsilon_; - float momentum_; - bool is_test_; - string data_format_; + fpga::FpgaPoolArgs fpga_pool_args; + + public: + const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; } + void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; } +#endif }; #endif @@ -704,10 +651,10 @@ class MultiClassNMSParam : public OpParam { class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - auto var = scope.Var("batch_size"); + const AttributeMap &attrs, Scope *scope) { + input_x_ = InputXFrom(inputs, 
*scope); + out_ = OutFrom(outputs, *scope); + auto var = scope->Var("batch_size"); batch_size = var->GetValue(); } const Tensor *InputX() const { return input_x_; } @@ -983,6 +930,15 @@ class FusionFcParam : public OpParam { int x_num_col_dims_; int y_num_col_dims_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaConvArgs fpga_conv_args; + + public: + const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } +#endif }; #ifdef FUSION_FCRELU_OP @@ -1032,6 +988,15 @@ class FusionConvAddParam : public OpParam { vector paddings_; vector dilations_; int groups; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaConvArgs fpga_conv_args; + + public: + const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } +#endif }; Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); @@ -1128,6 +1093,15 @@ class FusionConvAddBNReluParam : public OpParam { bool is_test_; Tensor *new_bias_; Tensor *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaConvArgs fpga_conv_args; + + public: + const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } +#endif }; #endif @@ -1213,6 +1187,15 @@ class FusionConvAddBNParam : public OpParam { bool is_test_; Tensor *new_bias_; Tensor *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaConvArgs fpga_conv_args; + + public: + const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } +#endif }; #endif @@ -1426,9 +1409,5 @@ class DropoutParam : public OpParam { }; #endif -#ifdef REGION_OP -class RegionParam : public OpParam {}; -#endif - } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/resize_op.h b/src/operators/resize_op.h index 9e7fd6b8029aebfdf4b7c53439936189b0c8eb8e..6cab048dea350d668c92fda56f6b6b197c38093d 100644 --- a/src/operators/resize_op.h +++ b/src/operators/resize_op.h @@ -33,7 +33,7 @@ class ResizeOp DeviceType, ResizeParam, operators::ResizeKernel> { public: ResizeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) : framework::OperatorWithKernel>( diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f747b8202c07b50511b252cc8217d1a4be7c37a9..418ebff79161675e8b23a4cca8f4319121aa6002 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -145,6 +145,10 @@ else () ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-conv-add-relu-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) endif() diff --git a/test/common/test_gemm.cpp b/test/common/test_gemm.cpp index 8cb778c458034aecf6cea89fcf0d3e2a3d8118ba..35241fbd535e062be1c7f1f28eb3860d118a3455 100644 --- a/test/common/test_gemm.cpp +++ b/test/common/test_gemm.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include #include "../test_helper.h" #include "common/log.h" @@ -20,13 +22,21 @@ limitations under the License. */ #define a(i, j) a[(i)*lda + (j)] #define b(i, j) b[(i)*ldb + (j)] +#define c(i, j) c[(i)*ldc + (j)] #define c1(i, j) c1[(i)*ldc + (j)] -#define m 62 -#define n 63 -#define k 74 +void print_matirx(int m, int n, int ldc, float *c) { + for (int i = 0; i < m; ++i) { + std::cout << c(i, 0); + for (int j = 1; j < n; ++j) { + std::cout << " | " << c(i, j); + } + std::cout << std::endl; + } + std::cout << std::endl; +} -int main() { +int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { int lda = k; int ldb = n; int ldc = n; @@ -39,44 +49,88 @@ int main() { static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); float *c1 = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); + float *scale = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m)); + float *bias = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m)); + srand(unsigned(time(0))); for (int i = 0; i < m * k; ++i) { - a[i] = 2; + a[i] = t1 + rand() % t2; } for (int i = 0; i < k * n; ++i) { - b[i] = 2; + b[i] = t1 + rand() % t2; } - for (int i = 0; i < m * n; ++i) { - c[i] = 2; - c1[i] = 2; + for (int i = 0; i < m; ++i) { + scale[i] = t1 + rand() % t2; } - - auto time1 = time(); - // paddle_mobile::operators::math::Sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, - // c, - // ldc); - auto time2 = time(); - DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n"; - for (int i = 0; i < m * n; ++i) { - std::cout << c[i] << " | "; - if (i % n == (n - 1)) { - std::cout << std::endl; - } + for (int i = 0; i < m; ++i) { + bias[i] = t1 + rand() % t2; } - for (int j = 0; j < n; ++j) { - for (int i = 0; i < m; ++i) { - c1(i, j) *= 0.3; - for (int p = 0; p < k; ++p) { - c1(i, j) += 0.9 * a(i, p) * b(p, j); + + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + float r = 0; + for (int p = 0; p < k; p++) { + r += a(i, p) * b(p, j); } + r *= scale[i]; + r += bias[i]; + if (relu && (r < 0)) { + r = 0; + } + c1(i, j) = r; } } - std::cout << "正确结果对比:" << std::endl; + + paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, + c, ldc, relu, scale, bias); + int eq = 0; + int neq = 0; for (int i = 0; i < m * n; ++i) { - std::cout << c1[i] << " | "; - if (i % n == (n - 1)) { - std::cout << std::endl; + if (static_cast(c[i]) == static_cast(c1[i])) { + ++eq; + } else { + ++neq; } } + + if (pr > 0) { + std::cout << "A:" << std::endl; + print_matirx(m, k, lda, a); + std::cout << "B:" << std::endl; + print_matirx(k, n, ldb, b); + std::cout << "C:" << std::endl; + print_matirx(m, n, ldc, c); + std::cout << "C1:" << std::endl; + print_matirx(m, n, ldc, c1); + } + + std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu + << " eq=" << eq << " neq=" << neq << std::endl; + + paddle_mobile::memory::Free(a); + paddle_mobile::memory::Free(b); + paddle_mobile::memory::Free(c); + paddle_mobile::memory::Free(c1); + paddle_mobile::memory::Free(scale); + paddle_mobile::memory::Free(bias); + + return 0; +} + +int main() { + do_sgemm(9, 9, 9, true, 10, 10, 10); + do_sgemm(10, 6, 12, false, 10, 10, 0); + do_sgemm(512, 256, 384, false, 10, 10, 0); + do_sgemm(1366, 768, 256, false, 10, 10, 0); + do_sgemm(1255, 755, 333, false, 10, 10, 0); + do_sgemm(555, 777, 999, false, 10, 10, 0); + + do_sgemm(10, 6, 12, true, -4, 10, 0); + 
do_sgemm(512, 256, 384, true, -4, 10, 0); + do_sgemm(1366, 768, 256, true, -4, 10, 0); + do_sgemm(1255, 755, 333, true, -4, 10, 0); + do_sgemm(555, 777, 999, true, -4, 10, 0); return 0; } diff --git a/test/executor_for_test.h b/test/executor_for_test.h index c9ab4783d6826992ee81ffd63b0391169645576c..93847af20a6d48a6df33dc50f6c6a1db76facf51 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -43,7 +43,7 @@ template class Executor4Test : public Executor { public: Executor4Test(Program p, string op_type, - bool use_optimize = false) + bool use_optimize = false, int predict_op_count = 1) : Executor() { this->use_optimize_ = use_optimize; this->program_ = p; @@ -57,12 +57,14 @@ class Executor4Test : public Executor { LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "to_predict_program_ == nullptr"; } + const std::vector> blocks = this->to_predict_program_->Blocks(); for (std::shared_ptr block_desc : blocks) { std::vector> ops = block_desc->Ops(); - for (std::shared_ptr op : ops) { - if (op->Type() == op_type) { + for (int i = 0; i < ops.size(); ++i) { + auto op = ops[i]; + if (op->Type() == op_type && i < predict_op_count) { DLOG << "匹配到: " << op->Type(); /// test first meeting op in program @@ -72,11 +74,17 @@ class Executor4Test : public Executor { op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), this->program_.scope); this->ops_of_block_[*block_desc.get()].push_back(op_ptr); - break; } } } this->InitMemory(); + + std::shared_ptr to_predict_block = + this->to_predict_program_->Block(0); + auto &ops = this->ops_of_block_[*to_predict_block.get()]; + for (const auto &op : ops) { + op->Init(); + } } template @@ -130,9 +138,6 @@ class Executor4Test : public Executor { auto *output_tensor = con_output->GetMutable(); output_tensor->mutable_data(dDim); - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - std::shared_ptr to_predict_block = this->to_predict_program_->Block(0); for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); @@ -141,6 +146,7 @@ class Executor4Test : public Executor { op->Run(); } - return out_tensor; + return std::make_shared( + paddle_mobile::framework::Tensor(*output_tensor)); } }; diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index bea7d4ba7d2df1344f0819222fbdb389106fa77e..25cad4feaa706899122902dee2a8f0c915e78975 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include + #include "../test_helper.h" #include "io/loader.h" @@ -20,12 +22,10 @@ int main() { // ../../../test/models/googlenet // ../../../test/models/mobilenet // auto program = loader.Load(g_googlenet, true); + // auto program = loader.Load(g_mobilenet_ssd, true); - auto program = loader.Load(g_mobilenet_ssd, true); - // auto program = loader.Load(g_googlenet_combine + "/model", - // g_googlenet_combine + - // "/params", true); - + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params", false); // program.originProgram->Description("program desc: "); return 0; } diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index a3d780a4854d018f948af2890bfe9f1e7a8fefef..9b4e5f2d3a431001e138977b78994f5dfedbe0a3 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -20,22 +20,20 @@ int main() { paddle_mobile::PaddleMobile paddle_mobile; paddle_mobile.SetThreadNum(4); auto time1 = time(); - auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model", - g_mobilenet_ssd_gesture + "/params", true); + auto isok = paddle_mobile.Load( + std::string(g_mobilenet_ssd_gesture) + "/model", + std::string(g_mobilenet_ssd_gesture) + "/params", true); // auto isok = paddle_mobile.Load(g_mobilenet_ssd, false); if (isok) { auto time2 = time(); std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + std::vector input; std::vector dims{1, 3, 300, 300}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 300, 300}, static_cast(0), - static_cast(1)); + GetInput(g_hand, &input, dims); - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); auto time3 = time(); - paddle_mobile.Predict(input, dims); + auto output = paddle_mobile.Predict(input, dims); auto time4 = time(); std::cout << "predict cost :" << time_diff(time3, time4) << "ms" << std::endl; diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 95ffc59c394782b69d17f16c549b0e6923fd31e8..9fc7226fc12fa7a0c631c9920487c0bd56c90816 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -24,19 +24,21 @@ int main() { auto time2 = time(); std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + std::vector input; std::vector dims{1, 3, 224, 224}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - auto time3 = time(); - auto vec_result = paddle_mobile.Predict(input, dims); - auto time4 = time(); - - std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl; + GetInput(g_test_image_1x3x224x224, &input, dims); + + for (int i = 0; i < 10; ++i) { + auto time3 = time(); + auto vec_result = paddle_mobile.Predict(input, dims); + auto time4 = time(); + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + << std::endl; + } } return 0; diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81400d987195364c06b4b93d0859469b43f90e7b --- /dev/null +++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp @@ -0,0 +1,62 @@ +/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/fusion_conv_add_bn_relu_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  //  ../models/image_classification_resnet.inference.model
+  auto program = loader.Load(g_mobilenet, true);
+
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::CPU,
+                paddle_mobile::operators::FusionConvAddBNReluOp<
+                    paddle_mobile::CPU, float>>
+      executor(program, "fusion_conv_add_bn_relu", true);
+
+  std::cout << "executor 4 test: " << std::endl;
+
+  paddle_mobile::framework::Tensor input;
+  GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});
+  //  // use SetupTensor if not has local input image .
+  //  SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
+  //                     static_cast<float>(1));
+
+  DLOG << " input: " << input;
+
+  auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
+  std::cout << "before predict: " << std::endl;
+  auto output = executor.Predict<LoDTensor>(input, "data",
+                                            "conv2_1_dw_bn.tmp_2", out_ddim);
+  std::cout << "after predict " << std::endl;
+  auto output_ptr = output->data<float>();
+
+  int stride = output->numel() / 100;
+  for (int i = 0; i < 100; i++) {
+    DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
+  }
+
+  //  for (int i = 0; i < 100; i++) {
+  //    DLOG << " index:" << i << " value: " << output_ptr[i];
+  //  }
+
+  //  for (int j = 0; j < output->numel(); ++j) {
+  //    std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
+  //  }
+  std::cout << std::endl;
+  return 0;
+}
diff --git a/test/test_helper.h b/test/test_helper.h
index fb6724f9c5764497ec81de0d73406709f098e0e0..658af447d6cfcd85c68ff350b104c2468d442e40 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -24,18 +24,22 @@ limitations under the License.
*/ #include "framework/ddim.h" #include "framework/tensor.h" -static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd"; -static const std::string g_mobilenet_ssd_gesture = - "../models/mobilenet+ssd_gesture"; -static const std::string g_squeezenet = "../models/squeezenet"; -static const std::string g_googlenet = "../models/googlenet"; -static const std::string g_mobilenet = "../models/mobilenet"; -static const std::string g_resnet_50 = "../models/resnet_50"; -static const std::string g_resnet = "../models/resnet"; -static const std::string g_googlenet_combine = "../models/googlenet_combine"; -static const std::string g_yolo = "../models/yolo"; -static const std::string g_test_image_1x3x224x224 = +static const char *g_ocr = "../models/ocr"; +static const char *g_mobilenet_ssd = "../models/mobilenet+ssd"; +static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture"; +static const char *g_squeezenet = "../models/squeezenet"; +static const char *g_googlenet = "../models/googlenet"; +static const char *g_mobilenet = "../models/mobilenet"; +static const char *g_resnet_50 = "../models/resnet_50"; +static const char *g_resnet = "../models/resnet"; +static const char *g_googlenet_combine = "../models/googlenet_combine"; +static const char *g_yolo = "../models/yolo"; +static const char *g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; +static const char *g_test_image_1x3x224x224_banana = + "../images/input_3x224x224_banana"; +static const char *g_hand = "../images/hand_image"; + using paddle_mobile::framework::DDim; using paddle_mobile::framework::Tensor; diff --git a/tools/android-cmake/android.toolchain.cmake b/tools/android-cmake/android.toolchain.cmake index a57d9c102ff65d4c10cc9bd3773ffa4c87e482fa..55b90ba65260b99d9af4a29832ed6f8ff5b235c8 100644 --- a/tools/android-cmake/android.toolchain.cmake +++ b/tools/android-cmake/android.toolchain.cmake @@ -65,6 +65,8 @@ endif() file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK) # Android NDK revision +message("${ANDROID_NDK}") + file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES) set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.") @@ -159,7 +161,7 @@ endif() # Default values for configurable variables. if(NOT ANDROID_TOOLCHAIN) - set(ANDROID_TOOLCHAIN clang) + set(ANDROID_TOOLCHAIN gcc) endif() if(NOT ANDROID_ABI) set(ANDROID_ABI armeabi-v7a) diff --git a/tools/build.sh b/tools/build.sh index db809f71076e6b6d4aacc53bd8e144db3935cb91..bf3545ef162c86c16c0877f5f25f3a1e09de1fd4 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -63,7 +63,7 @@ build_for_android() { TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" ANDROID_ARM_MODE="arm" - if [ "${#NETS}" > 1 ]; then + if [ "${#NETS}" -gt 1 ]; then cmake .. \ -B"../build/release/${PLATFORM}" \ -DANDROID_ABI="${ABI}" \ @@ -99,7 +99,7 @@ build_for_ios() { BUILD_DIR=../build/release/"${PLATFORM}"/ TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" mkdir -p "${BUILD_DIR}" - if [ "${#NETS}" > 1 ]; then + if [ "${#NETS}" -gt 1 ]; then cmake .. 
\ -B"${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ diff --git a/tools/op.cmake b/tools/op.cmake index ec9768443c5e9825931111803acf1f51c1aa1acd..361381b81a603274207e50aeb8f0feddcff4e2ed 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -75,11 +75,9 @@ if ("FPGAnets" IN_LIST NET) set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBN_OP ON) - set(FUSION_POOLBN_OP ON) set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_FC_OP ON) set(FUSION_FCRELU_OP ON) - set(REGION_OP ON) set(POOL_OP ON) set(CONCAT_OP ON) set(SOFTMAX_OP ON)