diff --git a/src/common/types.h b/src/common/types.h
index 7745f80a9ca2ef6f0258f6f2eacf45761d29a00e..6066879305d5ea7d1b6dcb0bb618c234338cc171 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -20,7 +20,9 @@ limitations under the License. */
 #include <string>
 
 namespace paddle_mobile {
-enum class Precision : int { FP32 = 0 };
+enum class Precision : int { FP32 = 0, FP16 = 1 };
+
+typedef int16_t half;
 
 template <Precision p>
 struct PrecisionTrait {
@@ -31,6 +33,10 @@ template <>
 struct PrecisionTrait<Precision::FP32> {
   typedef float ptype;
 };
+template <>
+struct PrecisionTrait<Precision::FP16> {
+  typedef half ptype;
+};
 
 //! device type
 enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api/fpga_api.h
index 2dfc285af4506c055f6780d7b3d393433c0904a8..42e99f4e4238d6974d23c1fb33bf238ca8a8626d 100644
--- a/src/fpga/api/fpga_api.h
+++ b/src/fpga/api/fpga_api.h
@@ -14,36 +14,35 @@ limitations under the License. */
 
 #pragma once
 
+#include <stdint.h>
 #include <cstddef>
 #include <iostream>
 #include <limits>
 
 // memory management;
 
-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
-namespace api {
 
 int open_device();
 int close_device();
 
-void *fpga_malloc(size_t size);
-void fpga_free(void *ptr);
-void fpga_copy(void *dst, const void *src, size_t num);
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dst, const void* src, size_t num);
 
 struct FpgaVersionArgs {
-  void *buf;
+  void* buf;
 };
 
 struct MemoryToPhysicalArgs {
-  const void *src;
+  const void* src;
   uint64_t physical;
 };
 
 struct MemoryCopyArgs {
-  void *src;
-  void *dst;
+  void* src;
+  void* dst;
   size_t size;
 };
 
@@ -51,38 +50,71 @@ struct FpgaQuantArgs {
   float scale;
 };
 
-struct FpgaBNArgs {};
+struct FpgaBNArgs {
+  bool enabled = false;
+  void* bias_addr;
+  void* scale_addr;
+};
+
+struct FpgaKernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_h;
+  uint32_t stride_w;
+};
+
+struct FpgaImageArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t channels;
+  uint32_t pad_h;
+  uint32_t pad_w;
+};
 
 struct FpgaConvArgs {
-  bool enable_BN = false;
-  bool enable_Relu = false;
-  struct FpgaBNParam bn_parm;
+  bool relu_enabled;
+  struct FpgaBNArgs BNargs;
+  void* image_addr;
+  void* filter_addr;
+  void* bias_addr;
+  void* output_addr;
+  float quant_scale;
+  struct FpgaImageArgs image;
+  uint32_t filter_num;
+  uint32_t group_num;
+
+  struct FpgaKernelArgs kernel;
 };
 
 struct FpgaPoolArgs {
-  bool enable_BN = false;
-  struct FpgaBNParam bn_parm;
+  void* image_addr;
+  void* output_addr;
+  struct FpgaImageArgs image;
+  struct FpgaKernelArgs kernel;
 };
 
-struct FpgaEWAddArgs {  // only support X + Y
-  bool enable_Relu = false;
+struct FpgaEWAddArgs {
+  bool relu_enabled;
+  void* image0_addr;
+  void* image1_addr;
+  void* result_addr;
+  uint32_t const0;
+  uint32_t const1;
+  uint32_t data_len;  // aligned element count
 };
 
-int ComputeFpgaConv(struct FpgaConvArgs);
-int ComputeFpgaPool(struct FpgaPoolArgs);
-int ComputeFpgaEWAdd(struct FpgaEWAddArgs);
+int ComputeFpgaConv(struct FpgaConvArgs args);
+int ComputeFpgaPool(struct FpgaPoolArgs args);
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);
 
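+// ioctl command numbers, as laid out below: 1-3 query the version and
+// quantization scale, 11 drives memory copies, and 21-23 configure the
+// conv / pooling / elementwise-add engines.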
-#define IOCTL_FPGA_MAGIC 'FPGA'
+#define IOCTL_FPGA_MAGIC 'CNN'
 
 #define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
 #define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
-#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaArgs)
+#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
 #define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
-#define IOCTL_MEM_TOPHY _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryToPhysicalArgs)
 
 #define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
 #define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
 #define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
 
-}  // namespace api
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h
index f16a65c28fb47e1cf4139588742ebe1073c3f3e6..a5890d34c600f6c4f4838ec94c202801b3044d3f 100644
--- a/src/framework/program/program-optimize/fusion_op_register.h
+++ b/src/framework/program/program-optimize/fusion_op_register.h
@@ -14,11 +14,13 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
 #include <map>
 #include <string>
+#include <vector>
 
 #include "framework/operator.h"
-#include "node.h"
+#include "framework/program/program-optimize/node.h"
 
 namespace paddle_mobile {
 namespace framework {
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 56e6d6bf18740489c195a66db70331cbab42aeea..954a65a3605c4d0204890d9414aeb074371b0d69 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -16,14 +16,15 @@ limitations under the License. */
 
 #include <cstdint>
 #include <cstring>
+#include <fstream>
 #include <memory>
+#include <string>
 #include <type_traits>
 #include <typeindex>
 #include <vector>
 
-#include "common/enforce.h"
-#include <fstream>
 #include "common/enforce.h"
+#include "common/types.h"
 #include "framework/data_layout.h"
 #include "framework/ddim.h"
 #include "memory/t_malloc.h"
@@ -63,7 +64,8 @@ struct SizeOfTypeFunctor {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+      functor;
   size_t size = functor(type);
   PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s",
                         type.name());
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index 65f019d1e3c3f6f6bdb8a18a9ff99bb7ecb2012c..6b0af3454e0cb9c41633bd793b76250028644abe 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -187,7 +187,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
       memcpy(&max_value, *data + sizeof(float), sizeof(float));
       *data += 2 * sizeof(float);
       const float factor = (max_value - min_value) / 255.0;
-      uint8_t *uint8_data = (uint8_t *)(*data);
+      uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
       for (int k = 0; k < memory_size; ++k) {
         static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
       }
@@ -419,7 +419,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
 }
 
 template class Executor<CPU, Precision::FP32>;
-template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
+template class Executor<FPGA, Precision::FP16>;
 
 }  // namespace paddle_mobile
diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp
index 19d771ddd5884412624a0720368ecc80f92678ea..f767f3481c999a16da46e75e314e8ebcb54193fa 100644
--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #ifdef CONCAT_OP
 
-#include "concat_op.h"
+#include <vector>
+
+#include "operators/concat_op.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -68,6 +70,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
 #endif
 
 #endif
diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h
index 7aedaab4b1fa00707661ada428c7c1dc27f124cd..bad0015917c2a9d4016def26c8a332d076b39c99 100644
--- a/src/operators/concat_op.h
+++ b/src/operators/concat_op.h
@@ -53,6 +53,7 @@ USE_OP_CPU(concat);
 USE_OP_MALI_GPU(concat);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(concat);
 #endif
 
 #endif
diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c691988f4a388c7835a7016602d7a1ac9cb5f9b6
--- /dev/null
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
@@ -0,0 +1,55 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONCAT_OP
+
+#include "operators/kernel/concat_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConcatKernel<FPGA, float>::Init(ConcatParam *param) {
+  return true;
+}
+
+template <>
+void ConcatKernel<FPGA, float>::Compute(const ConcatParam &param) const {
+  auto inputs = param.Inputs();
+  auto *out = param.Out();
+  int64_t axis = param.Axis();
+  out->mutable_data<half>();
+
+  DDim out_dim = out->dims();
+  int pixels = out_dim[1] * out_dim[2];
+  auto out_channel = out_dim[3];
+
+  auto out_offset = 0;
+
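+  // Concatenate along the last (channel) axis of the NHWC FP16 image:
+  // each input's data lands at a running channel offset in the output.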
+  for (int i = 0; i < inputs.size(); ++i) {
+    auto input = inputs[i];
+    auto channels = input->dims()[3];
+    out_offset += channels;
+    auto src = input->data<half>();
+    for (int j = 0; j < pixels; ++j) {
+      auto dst = out->data<half>() + out_offset;
+      memory::Copy(dst, src, sizeof(half));
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index e2795b3aefe3c67df9b51c882298a717a388ae15..a1c9baad79df159b1784ef0dd5d12ccf7ed7fe11 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -22,6 +22,9 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
+#ifdef PADDLE_MOBILE_FPGA
+#include "fpga/api/fpga_api.h"
+#endif
 
 namespace paddle_mobile {
 namespace operators {
@@ -256,6 +259,15 @@ class ElementwiseAddParam : OpParam {
   Tensor *input_y_;
   Tensor *out_;
   int axis_;
+#ifdef PADDLE_MOBILE_FPGA
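+  // Descriptor consumed by the FPGA elementwise-add engine; the kernel
+  // fills it via SetFpgaArgs() and reads it back through FpgaArgs().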
+
+ private:
+  fpga::FpgaEWAddArgs fpga_EW_add_args;
+
+ public:
+  const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; }
+  void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) { fpga_EW_add_args = args; }
+#endif
 };
 
 #ifdef FUSION_ELEMENTWISEADDRELU_OP
@@ -450,80 +462,15 @@ class PoolParam : public OpParam {
   vector<int> paddings_;
   bool ceil_mode_;
   bool global_pooling_ = false;
-};
-#endif
-
-#ifdef FUSION_POOLBN_OP
-class FusionPoolBNParam : OpParam {
- public:
-  FusionPoolBNParam(const VariableNameMap &inputs,
-                    const VariableNameMap &outputs, const AttributeMap &attrs,
-                    const Scope &scope) {
-    input_ = InputXFrom(inputs, scope);
-    pooling_type_ = GetAttr<string>("pooling_type", attrs);
-    ksize_ = GetAttr<vector<int>>("ksize", attrs);
-    strides_ = GetAttr<vector<int>>("strides", attrs);
-    paddings_ = GetAttr<vector<int>>("paddings", attrs);
-    ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
-    global_pooling_ = GetAttr<bool>("global_pooling", attrs);
-    output_y_ = OutputYFrom(outputs, scope);
-    input_bias_ = InputBiasFrom(inputs, scope);
-    input_mean_ = InputMeanFrom(inputs, scope);
-    input_scale_ = InputScaleFrom(inputs, scope);
-    input_variance_ = InputVarianceFrom(inputs, scope);
-    epsilon_ = GetAttr<float>("epsilon", attrs);
-    momentum_ = GetAttr<float>("momentum", attrs);
-    // is_test_ = GetAttr<bool>("is_test", attrs);
-  }
-  const Tensor *Input() const { return input_; }
-
-  const string &PoolingType() const { return pooling_type_; }
-
-  const vector<int> &Ksize() const { return ksize_; }
-
-  const vector<int> &Strides() const { return strides_; }
-
-  const vector<int> &Paddings() const { return paddings_; }
-
-  bool isCeilMode() const { return ceil_mode_; }
-
-  bool isGlobalPooling() const { return global_pooling_; }
-
-  Tensor *OutputY() const { return output_y_; }
-
-  const Tensor *InputBias() const { return input_bias_; }
-
-  const Tensor *InputMean() const { return input_mean_; }
-
-  const Tensor *InputScale() const { return input_scale_; }
-
-  const Tensor *InputVariance() const { return input_variance_; }
-
-  const float &Epsilon() const { return epsilon_; }
-
-  const float &Momentum() const { return momentum_; }
-
-  const bool &IsTest() const { return is_test_; }
-
-  const string &DataFormat() const { return data_format_; }
+#ifdef PADDLE_MOBILE_FPGA
 
  private:
-  Tensor *input_;
-  string pooling_type_;
-  vector<int> ksize_;
-  vector<int> strides_;
-  vector<int> paddings_;
-  bool ceil_mode_;
-  bool global_pooling_ = false;
-  Tensor *output_y_;
-  Tensor *input_bias_;
-  Tensor *input_mean_;
-  Tensor *input_scale_;
-  Tensor *input_variance_;
-  float epsilon_;
-  float momentum_;
-  bool is_test_;
-  string data_format_;
+  fpga::FpgaPoolArgs fpga_pool_args;
+
+ public:
+  const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; }
+  void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; }
+#endif
 };
 #endif
 
@@ -704,7 +651,7 @@ class MultiClassNMSParam : public OpParam {
 class FeedParam : public OpParam {
  public:
   FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const AttributeMap &attrs, Scope &scope) {
+            const AttributeMap &attrs, Scope const &scope) {
     input_x_ = InputXFrom(inputs, scope);
     out_ = OutFrom(outputs, scope);
     auto var = scope.Var("batch_size");
@@ -983,6 +930,15 @@ class FusionFcParam : public OpParam {
   int x_num_col_dims_;
   int y_num_col_dims_;
   int axis_;
+#ifdef PADDLE_MOBILE_FPGA
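+  // Note: FC carries a FpgaConvArgs descriptor, i.e. it is configured
+  // through the same path as convolution on this hardware.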
+
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 
 #ifdef FUSION_FCRELU_OP
@@ -1032,6 +988,15 @@ class FusionConvAddParam : public OpParam {
   vector<int> paddings_;
   vector<int> dilations_;
   int groups;
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 
 Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
@@ -1128,6 +1093,15 @@ class FusionConvAddBNReluParam : public OpParam {
   bool is_test_;
   Tensor *new_bias_;
   Tensor *new_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
 
@@ -1213,6 +1187,15 @@ class FusionConvAddBNParam : public OpParam {
   bool is_test_;
   Tensor *new_bias_;
   Tensor *new_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
 
@@ -1426,9 +1409,5 @@ class DropoutParam : public OpParam {
 };
 #endif
 
-#ifdef REGION_OP
-class RegionParam : public OpParam {};
-#endif
-
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/resize_op.h b/src/operators/resize_op.h
index 9e7fd6b8029aebfdf4b7c53439936189b0c8eb8e..6cab048dea350d668c92fda56f6b6b197c38093d 100644
--- a/src/operators/resize_op.h
+++ b/src/operators/resize_op.h
@@ -33,7 +33,7 @@ class ResizeOp
           DeviceType, ResizeParam, operators::ResizeKernel<DeviceType, T>> {
  public:
   ResizeOp(const std::string &type, const VariableNameMap &inputs,
-           const VariableNameMap &outputs, const framework::AttributeMap attrs,
+           const VariableNameMap &outputs, const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
       : framework::OperatorWithKernel<
             DeviceType, ResizeParam, operators::ResizeKernel<DeviceType, T>>(
diff --git a/test_gemm.cpp b/test_gemm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a49193256d8293dc2cef559b1d1e73bc6dfc7bb
--- /dev/null
+++ b/test_gemm.cpp
@@ -0,0 +1,136 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include "../test_helper.h"
+#include "common/log.h"
+#include "memory/t_malloc.h"
+#include "operators/math/gemm.h"
+
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c(i, j) c[(i)*ldc + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+
+void print_matrix(int m, int n, int ldc, float *c) {
+  for (int i = 0; i < m; ++i) {
+    std::cout << c(i, 0);
+    for (int j = 1; j < n; ++j) {
+      std::cout << " | " << c(i, j);
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+}
+
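+// do_sgemm checks SgemmWithBn against a naive reference: m/n/k are the
+// GEMM dimensions, random entries are drawn from [t1, t1 + t2), relu
+// applies a ReLU to the reference result, and pr > 0 prints the matrices.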
+int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
+
+  float *a =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
+  float *b =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
+  float *c =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  float *c1 =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  float *scale =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+  float *bias =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+
+  srand(unsigned(time(0)));
+  for (int i = 0; i < m * k; ++i) {
+    a[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < k * n; ++i) {
+    b[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < m; ++i) {
+    scale[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < m; ++i) {
+    bias[i] = t1 + rand() % t2;
+  }
+
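+  // Naive reference: c1(i, j) = scale[i] * sum_p a(i, p) * b(p, j) + bias[i],
+  // optionally ReLU-clamped, to compare against the optimized kernel.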
"${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES) set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.") @@ -159,7 +161,7 @@ endif() # Default values for configurable variables. if(NOT ANDROID_TOOLCHAIN) - set(ANDROID_TOOLCHAIN clang) + set(ANDROID_TOOLCHAIN gcc) endif() if(NOT ANDROID_ABI) set(ANDROID_ABI armeabi-v7a) diff --git a/tools/build.sh b/tools/build.sh index db809f71076e6b6d4aacc53bd8e144db3935cb91..ced18a180762826ffa2c45949e5aab9bfe5c8f88 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -40,8 +40,8 @@ build_for_android() { fi if [ -z "$PLATFORM" ]; then - PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. -# PLATFORM="arm-v8a" +# PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. + PLATFORM="arm-v8a" fi if [ "${PLATFORM}" = "arm-v7a" ]; then @@ -63,7 +63,7 @@ build_for_android() { TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" ANDROID_ARM_MODE="arm" - if [ "${#NETS}" > 1 ]; then + if [ "${#NETS}" -gt 1 ]; then cmake .. \ -B"../build/release/${PLATFORM}" \ -DANDROID_ABI="${ABI}" \ @@ -99,7 +99,7 @@ build_for_ios() { BUILD_DIR=../build/release/"${PLATFORM}"/ TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" mkdir -p "${BUILD_DIR}" - if [ "${#NETS}" > 1 ]; then + if [ "${#NETS}" -gt 1 ]; then cmake .. \ -B"${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ diff --git a/tools/op.cmake b/tools/op.cmake index ec9768443c5e9825931111803acf1f51c1aa1acd..361381b81a603274207e50aeb8f0feddcff4e2ed 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -75,11 +75,9 @@ if ("FPGAnets" IN_LIST NET) set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBN_OP ON) - set(FUSION_POOLBN_OP ON) set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_FC_OP ON) set(FUSION_FCRELU_OP ON) - set(REGION_OP ON) set(POOL_OP ON) set(CONCAT_OP ON) set(SOFTMAX_OP ON)