Merge branch 'develop' into dev-latest

8aac7d9d · xiebaiyuan · GitHub · 23858ac2 · a2ab5734 · 8aac7d9d
29 changed files
--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -57,7 +57,12 @@ class RawData {
 public:
  char data[size];
  RawData() {}
-  RawData(const RawData &raw_data) { strcpy(data, raw_data.data); }
+  RawData(const RawData &raw_data) { memcpy(data, raw_data.data, size); }
+
+  // Copy-assigns the raw bytes held by another RawData.
+  // Self-assignment is skipped: memcpy requires that source and destination
+  // do not overlap, and assigning an object to itself would pass the same
+  // range for both.
+  RawData &operator=(const RawData &raw_data) {
+    if (this != &raw_data) {
+      memcpy(data, raw_data.data, size);
+    }
+    return *this;
+  }
 };

 template <typename... Ts>
@@ -74,14 +79,36 @@ struct Variant {

  template <typename T, typename... Args>
  void Set(Args &&... args) {
-    helper::Destroy(type_id, &data);
-    new (&data) T(std::forward<Args>(args)...);
+    // Replace the held value: destroy whatever type_id currently denotes,
+    // placement-new a T (built from the forwarded args) into the raw byte
+    // buffer, then record T's type hash as the new type_id.
+    helper::Destroy(type_id, &data.data);
+    new (&data.data) T(std::forward<Args>(args)...);
    type_id = typeid(T).hash_code();
  }

+  // Stores a string as a NUL-terminated character sequence directly in the
+  // raw buffer (no std::string object is constructed there) and tags the
+  // variant as holding std::string. Read it back with GetString().
+  //
+  // The length is checked before copying: strcpy knows nothing about the
+  // size of the destination, so a string that does not fit would be written
+  // past the end of data.data.
+  //
+  // NOTE(review): the previously held value is not destroyed here (the
+  // Destroy call below was already commented out) — confirm this is
+  // acceptable when the variant currently holds a non-trivial type.
+  void SetString(std::string &string) {
+    //    helper::Destroy(type_id, &data);
+    if (strlen(string.c_str()) + 1 > sizeof(data.data)) {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          " string is too long to be stored in variant ");
+      exit(1);
+    }
+    type_id = typeid(std::string).hash_code();
+    strcpy(data.data, string.c_str());
+  }
+
+  // Builds a std::string from the NUL-terminated characters in the raw
+  // buffer. If the variant is not tagged as std::string, a bad-cast error is
+  // reported and the process exits.
+  std::string GetString() const {
+    const bool holds_string = type_id == typeid(std::string).hash_code();
+    if (!holds_string) {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          " bad cast in variant data type not a string ");
+      exit(0);
+    }
+    return std::string(data.data);
+  }
+
  template <typename T>
  T &Get() const {
-    if (type_id == typeid(T).hash_code()) {
+    if (type_id == typeid(std::string).hash_code()) {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          "Please use getString to get an string (to avoid of an issue with "
+          "gcc "
+          "stl lib with string copy)");
+      exit(0);
+    } else if (type_id == typeid(T).hash_code()) {
      return *const_cast<T *>(reinterpret_cast<const T *>(&data));
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");

--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -104,7 +104,7 @@ int fpga_invalidate(void *address, size_t size) {
 }

 half fp32_2_fp16(float fp32_num) {
-  unsigned long tmp = *(unsigned long *)(&fp32_num);
+  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
  half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
           (((tmp & 0x7f800000) >> 13) - (112 << 10));
  if (tmp & 0x1000) {
@@ -120,7 +120,7 @@ float fp16_2_fp32(half fp16_num) {
  int tmp = 0;
  float fp32_num;
  tmp = s << 16 | exp << 23 | frac << 13;
-  fp32_num = *(float *)&tmp;
+  fp32_num = *(float *)&tmp;  // NOLINT
  return fp32_num;
 }

@@ -347,6 +347,20 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
  filter_tensor->reset_data_ptr(new_data);
 }

+// Prepares a fully-connected filter tensor: stores the two scale factors
+// derived from max_value, copies the tensor's float data into a buffer
+// obtained from fpga_malloc, runs filter::format_fc_filter on that buffer
+// (group count 1), and finally points the tensor at the resulting data.
+void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
+  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
+  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
+
+  auto shape = filter_tensor->dims();
+  auto num = shape[0];
+  auto channel = shape[1];
+  auto height = shape[2];
+  auto width = shape[3];
+
+  size_t byte_count = num * channel * height * width * sizeof(float);
+  auto source = filter_tensor->data<float>();
+  auto formatted = (float *)fpga_malloc(byte_count);  // NOLINT
+  fpga_copy(formatted, source, byte_count);
+
+  // The formatter may replace the buffer, hence the pointer-to-pointer.
+  filter::format_fc_filter(&formatted, num, channel, height, width, 1,
+                           max_value);
+  filter_tensor->reset_data_ptr(formatted);
+}
+
 void format_bias_scale_array(float **bias_scale_array,
                             int element_num_per_division, int num) {
  bias_scale::format_bias_scale_array(bias_scale_array,

--- a/src/fpga/api.h
+++ b/src/fpga/api.h
@@ -109,8 +109,8 @@ struct PoolingArgs {
 struct EWAddArgs {
  bool relu_enabled;

-  half const0;  // output0 = const0 x input0 + const1 x input1;
-  half const1;
+  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
+  uint32_t const1;
  struct ImageInputArgs image0;
  struct ImageInputArgs image1;
  struct ImageOutputArgs output;
@@ -214,6 +214,7 @@ int get_aligned_filter_element_num(int chw);
 int get_aligned_filter_num(int num);
 void format_filter(framework::Tensor* filter_tensor, float max_value,
                   int group_num);
+void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
 void format_bias_scale_array(float** bias_scale_array,
                             int element_num_per_division, int num);
 void format_concat_output(framework::Tensor* out, int height, int width,

--- a/src/fpga/filter.cpp
+++ b/src/fpga/filter.cpp
@@ -225,6 +225,45 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
                                 num_after_alignment * sizeof(char));
 }

+// Transposes an FC filter stored as a (chw x num) byte matrix into a
+// (num x chw) matrix: output element (n, c) is read from input element
+// (c, n). The result is written to a buffer obtained from fpga_malloc, the
+// input buffer is released with fpga_free, and *data_in is redirected to the
+// new buffer.
+void convert_fc_filter(char **data_in, int num, int chw) {
+  char *source = *data_in;
+  char *transposed = (char *)fpga_malloc(chw * num * sizeof(char));  // NOLINT
+  for (int row = 0; row < num; ++row) {
+    char *dest_row = transposed + row * chw;
+    for (int col = 0; col < chw; ++col) {
+      dest_row[col] = source[num * col + row];
+    }
+  }
+  *data_in = transposed;
+  fpga_free(source);
+}
+
+// Quantizes an FC filter and rearranges it step by step: quantize the float
+// data, transpose it with convert_fc_filter, then apply align_element,
+// align_num, reorder and interleave, and finally flush the resulting buffer.
+void format_fc_filter(float **data_in, int num, int channel, int height,
+                      int width, int group_num, float max) {
+  const int chw = channel * height * width;
+  const int element_count = channel * height * width * num;
+
+  // Sizes of the divisions the filters are split into.
+  const int capacity = calc_division_capacity(chw);
+  const int per_div_raw = calc_num_per_div(num, group_num, capacity);
+  const int per_div_aligned = align_to_x(per_div_raw, FILTER_NUM_ALIGNMENT);
+  const int division_count = (num + per_div_raw - 1) / per_div_raw;
+  const int aligned_num = per_div_aligned * division_count;
+
+  quantize(data_in, element_count, max);
+
+  // From here on the same buffer is addressed through a char pointer.
+  char **bytes = (char **)data_in;  // NOLINT
+
+  convert_fc_filter(bytes, num, chw);
+  align_element(bytes, num, chw);
+  align_num(bytes, per_div_raw, num, chw);
+  reorder(bytes, aligned_num, chw);
+  interleave(bytes, aligned_num, chw);
+
+  const auto aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  fpga_flush(*bytes, aligned_chw * aligned_num * sizeof(char));
+}
+
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/filter.h
+++ b/src/fpga/filter.h
@@ -25,7 +25,7 @@ int calc_division_capacity(int chw);
 int calc_split_num(int num, int division_capacity);
 int calc_division_number(int num, int group_num, int division_capacity);
 int calc_num_per_div(int num, int group_num, int division_capacity);
-void convert_to_hwc(float** data_in, int num, int channel, int height,
+void convert_to_hwc(char** data_in, int num, int channel, int height,
                    int width);
 float find_max(float* data_in, int data_size);
 void quantize(float** data_in, int data_size, float max);
@@ -36,6 +36,11 @@ void reorder(float** data_in, int num_after_alignment, int chw);
 void interleave(float** data_in, int num_after_alignment, int chw);
 void format_filter(float** data_in, int num, int channel, int height, int width,
                   int group_num, float max);
+
+// Transposes the FC filter bytes behind *data_in (indexed by filter number
+// `num` and element count `chw`) into a newly allocated buffer and updates
+// *data_in to point at it.
+void convert_fc_filter(char** data_in, int num, int chw);
+// Quantizes an FC filter and rearranges it for the FPGA; the buffer behind
+// *data_in may be replaced along the way.
+void format_fc_filter(float** data_in, int num, int channel, int height,
+                      int width, int group_num, float max);
+
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -51,7 +51,7 @@ class Attribute {
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING: {
-        attr.Set<std::string>(std::string(attr_desc->s));
+        attr.SetString(std::string(attr_desc->s));
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: {
@@ -108,6 +108,13 @@ class Attribute {
    return variant_.Get<T>();
  }

+  // Stores `string` in the wrapped variant through its SetString() and
+  // returns *this so that calls can be chained.
+  Attribute &SetString(std::string string) {
+    variant_.SetString(string);
+    return *this;
+  }
+
+  // Returns the string held by the wrapped variant (forwards to its
+  // GetString()).
+  std::string GetString() const { return variant_.GetString(); }
+
  template <typename Vistor>
  static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) {
    if (attr.variant_.TypeId() == typeid(int).hash_code()) {
@@ -115,7 +122,7 @@ class Attribute {
    } else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
      return vistor(attr.variant_.Get<float>());
    } else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
-      return vistor(attr.variant_.Get<string>());
+      return vistor(attr.variant_.GetString());
    } else if (attr.variant_.TypeId() == typeid(vector<int>).hash_code()) {
      return vistor(attr.variant_.Get<vector<int>>());
    } else if (attr.variant_.TypeId() == typeid(vector<float>).hash_code()) {

--- a/src/framework/ddim.h
+++ b/src/framework/ddim.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <cstdlib>
 #include <initializer_list>
+#include <string>
 #include <typeinfo>
 #include <vector>


--- a/src/framework/variable.h
+++ b/src/framework/variable.h
@@ -33,6 +33,13 @@ class Variable {

  template <typename T>
  const T GetValue() const {
+    // Requests for std::string are rejected here: an error is reported and
+    // the process exits instead of forwarding to variant.Get<T>().
+    if (typeid(T) == typeid(std::string)) {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          "Please use getString to get an string (to avoid of an issue with "
+          "gcc "
+          "stl lib with string copy)");
+      // NOTE(review): exit status 0 normally signals success — confirm that
+      // this is the intended status for an error path.
+      exit(0);
+    }
    return variant.Get<T>();
  }


--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -101,6 +101,11 @@ bool PaddleMobilePredictor<Dtype, P>::Run(
  return true;
 }

+// Calls Clear() on the wrapped PaddleMobile instance before the predictor
+// goes away. paddle_mobile_ is a unique_ptr, so it is checked first: a
+// predictor whose engine was never created (or was already released) must
+// not dereference a null pointer during destruction.
+template <typename Dtype, Precision P>
+PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() {
+  if (paddle_mobile_) {
+    paddle_mobile_->Clear();
+  }
+}
+
 // A factory to help create difference predictor.
 template <>
 std::unique_ptr<PaddlePredictor>

--- a/src/io/api_paddle_mobile.h
+++ b/src/io/api_paddle_mobile.h
@@ -32,7 +32,7 @@ namespace paddle_mobile {
 template <typename Dtype = CPU, Precision P = Precision::FP32>
 class PaddleMobilePredictor : public PaddlePredictor {
 public:
-  PaddleMobilePredictor() {}
+  PaddleMobilePredictor() = delete;

  explicit PaddleMobilePredictor(const PaddleMobileConfig& config);

@@ -40,7 +40,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
           std::vector<PaddleTensor>* output_data,
           int batch_size = -1) override;

-  ~PaddleMobilePredictor() override{};
+  ~PaddleMobilePredictor() override;

 private:
  std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_;

--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -87,7 +87,6 @@ enum class PaddleEngineKind {
 class PaddlePredictor {
 public:
  struct Config;
-  PaddlePredictor() = default;
  PaddlePredictor(const PaddlePredictor&) = delete;
  PaddlePredictor& operator=(const PaddlePredictor&) = delete;

@@ -107,6 +106,9 @@ class PaddlePredictor {
  struct Config {
    std::string model_dir;  // path to the model directory.
  };
+
+ protected:
+  PaddlePredictor() = default;
 };

 struct PaddleMobileConfig : public PaddlePredictor::Config {

--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -46,7 +46,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {

  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
  float max_value = fpga::filter_find_max(filter);
-  fpga::format_filter(filter, max_value, 1);
+  fpga::format_fc_filter(filter, max_value);

  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);

--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -47,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {

  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
  float max_value = fpga::filter_find_max(filter);
-  fpga::format_filter(filter, max_value, 1);
+  fpga::format_fc_filter(filter, max_value);

  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);

--- a/src/operators/kernel/fpga/mul_kernel.cpp
+++ b/src/operators/kernel/fpga/mul_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef MUL_OP
+
+#include "operators/kernel/mul_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Prepares the FPGA arguments for a mul op by expressing it as a
+// convolution: InputY is resized into a 4-D filter and formatted with
+// fpga::format_fc_filter, a bias/scale array is built, and everything is
+// packed into a WrapperConvArgs that is stored on `param`.
+// Always returns true.
+template <>
+bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
+  auto filter = const_cast<LoDTensor *>(param->InputY());
+  auto out = param->Out();
+
+  // NOTE(review): this check and the `chw == input_x->numel()` check further
+  // down both compare against filter->dims()[0]; they can only hold together
+  // when input_x->dims()[1] equals input_x->numel() — confirm this is
+  // intended.
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = (uint32_t)out->dims()[1];
+  // Buffer of 2 * channel floats: the first half is filled with 0 and the
+  // second half with 1 (presumably bias followed by scale — TODO confirm).
+  auto bs_ptr =
+      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = 0;
+  }
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(
+      chw == input_x->numel(),
+      "Filter element num should be equal to IFM element num");
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
+  int filter_channel = chw / height / width;
+
+  // Give the 2-D weight a 4-D filter shape before it is measured and
+  // formatted.
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  float max_value = fpga::filter_find_max(filter);
+  fpga::format_fc_filter(filter, max_value);
+
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
+  fpga::format_fp16_ofm(out);
+
+  // NOTE(review): the literal arguments 1, 1, 1, 0, 0 are presumably
+  // group/stride/padding settings — confirm against fill_conv_arg.
+  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
+                      0, bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+// Passes the FPGA arguments stored on `param` (set by Init through
+// SetFpgaArgs) to fpga::ComputeFpgaConv.
+template <>
+void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -35,146 +35,166 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {

-/*
+class Gemm {
+ public:
+  /*
 // 将 A 矩阵分块复制到连续内存(ColMajor)
 void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
-                 float *buffer);
+           float *buffer);

 // 将 B 矩阵分块复制到连续内存(ColMajor)
 void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
-                 float *buffer);
+           float *buffer);
 */
-
-// 将 A 矩阵分块复制到连续内存(RowMajor)
-void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer);
-void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer);
-void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
-                    float *buffer);
-void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
-                        float *buffer);
-void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
-                        float *buffer);
-
-// 将 B 矩阵分块复制到连续内存(RowMajor)
-void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
-                    float *buffer);
-void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
-                     float *buffer);
-void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
-                     float *buffer);
-void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
-                        float *buffer);
-void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
-                         float *buffer);
-void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
-                         float *buffer);
-
-// 分块矩阵乘法
-void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
-                 float beta, float *c, float *C, int ldc, bool relu);
-void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
+  typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *);
+  typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *,
+                                 int);
+  FnPack procPackA;
+  FnPack procPackB;
+  FnAddDot procAddDot;
+
+  // 将 A 矩阵分块复制到连续内存(RowMajor)
+  void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                      float *buffer);
+  void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+                      float *buffer);
+  void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
+                      float *buffer);
+  void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
+                          float *buffer);
+  void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
+                          float *buffer);
+
+  // 将 B 矩阵分块复制到连续内存(RowMajor)
+  void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                      float *buffer);
+  void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
+                       float *buffer);
+  void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
+                       float *buffer);
+  void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
+                          float *buffer);
+  void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
+                           float *buffer);
+  void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
+                           float *buffer);
+
+  // 分块矩阵乘法
+  void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+                   float beta, float *c, float *C, int ldc, bool relu);
+  void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
+                           const float *b, float beta, float *c, float *C,
+                           int ldc, bool relu, float *bias);
+
+  void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
                         const float *b, float beta, float *c, float *C,
-                         int ldc, bool relu, float *bias);
-
-void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
-                       const float *b, float beta, float *c, float *C, int ldc,
-                       bool relu, float *new_scale, float *new_bias);
-void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
-                          const float *b, float beta, float *c, float *C,
-                          int ldc, bool relu, float *new_scale, float *new_bias,
+                         int ldc, bool relu, float *new_scale, float *new_bias);
+  void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
+                            const float *b, float beta, float *c, float *C,
+                            int ldc, bool relu, float *new_scale,
+                            float *new_bias, float *bias);
+  void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
+                            float *c, float *C, int ldc, float *p,
+                            std::string mode, float *bias, float *bias1);
+  /*
+  // 向量矩阵乘法 (M = 1)
+  void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
+                    const float *B, int ldb, float beta, float *C, int ldc,
+                    bool relu);
+
+  void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+                          int lda, const float *B, int ldb, float beta, float
+  *C, int ldc, bool relu, float *new_scale, float *new_bias);
+  */
+
+  // 计算一个更小的 C 矩阵分块
+  void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
+  void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
+
+  // 分块矩阵乘法结果回写
+  // C = A * B
+  void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
+  // C = alpha * A * B + beta * C
+  void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B + C
+  void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B + bias
+  void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
+  // C = A * B + C, relu(C)
+  void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
+  // C = A * B + C,prelu(C)
+  void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
+                         std::string mode, float *bias, float *bias1);
+  // C = A * B + bias ,relu(C)
+  void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
                          float *bias);
-void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
-                          float *c, float *C, int ldc, float *p,
-                          std::string mode, float *bias, float *bias1);
-/*
-// 向量矩阵乘法 (M = 1)
-void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                  const float *B, int ldb, float beta, float *C, int ldc,
-                  bool relu);
-
-void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
-                        int lda, const float *B, int ldb, float beta, float *C,
-                        int ldc, bool relu, float *new_scale, float *new_bias);
-*/
+  // C = A * B, batchnorm(C)
+  void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
+                   float *new_scale, float *new_bias);
+  // C = A * B, batchnorm(C), relu(C)
+  void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+                       float *new_scale, float *new_bias);
+  void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+                          float *new_scale, float *new_bias, float *bias1);
+  /*
+  // 向量矩阵乘法结果回写
+  // C = A * B
+  void VecWriteBasic(int n, float *c, float *C, int ldc);
+  // C = alpha * A * B + beta * C
+  void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
+  // C = A * B + C
+  void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+  // C = A * B + C, relu(C)
+  void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+  // C = A * B, batchnorm(C)
+  void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
+                      float *new_bias);
+  // C = A * B, batchnorm(C), relu(C)
+  void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
+                          float *new_bias);
+  */
+
+  // 32位 float 矩阵乘法
+  void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+             const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+             float *bias);
+
+  // 32-bit float matrix multiplication, followed by batchnorm on the result
+  void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                   const float *B, int ldb, float beta, float *C, int ldc,
+                   bool relu, float *new_scale, float *new_bias, float *bias);
+  void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
+                      const float *B, int ldb, float *C, int ldc, float *p,
+                      std::string mode, float *bias, float *bias1);
+
+  // 32位 float 矩阵乘法（openmp 多线程版本）
+  void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *bias);

-// 计算一个更小的 C 矩阵分块
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
-void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
-void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
-void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
-void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
-
-// 分块矩阵乘法结果回写
-// C = A * B
-void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
-// C = alpha * A * B + beta * C
-void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B + C
-void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B + bias
-void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
-// C = A * B + C, relu(C)
-void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
-// C = A * B + C,prelu(C)
-void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
-                       std::string mode, float *bias, float *bias1);
-// C = A * B + bias ,relu(C)
-void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
-                        float *bias);
-// C = A * B, batchnorm(C)
-void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
-                 float *new_bias);
-// C = A * B, batchnorm(C), relu(C)
-void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
-                     float *new_scale, float *new_bias);
-void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
-                        float *new_scale, float *new_bias, float *bias1);
-/*
-// 向量矩阵乘法结果回写
-// C = A * B
-void VecWriteBasic(int n, float *c, float *C, int ldc);
-// C = alpha * A * B + beta * C
-void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
-// C = A * B + C
-void VecWriteWithAdd(int n, float *c, float *C, int ldc);
-// C = A * B + C, relu(C)
-void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
-// C = A * B, batchnorm(C)
-void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
-                    float *new_bias);
-// C = A * B, batchnorm(C), relu(C)
-void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
-                        float *new_bias);
-*/
+  // 32-bit float matrix multiplication, followed by batchnorm on the result
+  // (OpenMP multi-threaded version)
+  void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
+                       int lda, const float *B, int ldb, float beta, float *C,
+                       int ldc, bool relu, float *new_scale, float *new_bias,
+                       float *bias);
+
+  void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
+                          const float *B, int ldb, float *C, int ldc, float *p,
+                          std::string mode, float *bias, float *bias1);

-// 32位 float 矩阵乘法
-void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu,
-           float *bias);
+ private:
+  int MC = 0;
+  int KC = 0;
+  int NC = 0;

-// 32位 float 矩阵乘法, 并对结果进行 batchnrom
-void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
-                 const float *B, int ldb, float beta, float *C, int ldc,
-                 bool relu, float *new_scale, float *new_bias, float *bias);
-void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
-                    const float *B, int ldb, float *C, int ldc, float *p,
-                    std::string mode, float *bias, float *bias1);
-
-// 32位 float 矩阵乘法（openmp 多线程版本）
-void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
-               const float *B, int ldb, float beta, float *C, int ldc,
-               bool relu, float *bias);
-
-// 32位 float 矩阵乘法, 并对结果进行 batchnrom（openmp 多线程版本）
-void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
-                     const float *B, int ldb, float beta, float *C, int ldc,
-                     bool relu, float *new_scale, float *new_bias, float *bias);
-
-void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
-                        const float *B, int ldb, float *C, int ldc, float *p,
-                        std::string mode, float *bias, float *bias1);
+  // Buffer pointers start out as nullptr so that a freshly constructed Gemm
+  // never holds indeterminate pointer values; this matches MC/KC/NC, which
+  // already carry default initializers.
+  float *packedA = nullptr;
+  float *packedB = nullptr;
+  float *packedC = nullptr;
+  float *zero = nullptr;
+};

 }  // namespace math
 }  // namespace operators

--- a/src/operators/math/gru_compute.cpp
+++ b/src/operators/math/gru_compute.cpp
@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> {
  static void compute(GRUMetaValue<T> value, int frame_size, int batch_size,
                      const ActivationType active_node,
                      const ActivationType active_gate) {
+    Gemm gemm;
    if (value.prev_out_value) {
-      Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value,
-            frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value,
-            frame_size * 3, false, nullptr);
+      gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
+                 value.prev_out_value, frame_size, value.gate_weight,
+                 frame_size * 2, 1, value.gate_value, frame_size * 3, false,
+                 nullptr);
    }

    forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
                         batch_size, active_gate);

    if (value.prev_out_value) {
-      Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value,
-            frame_size, value.state_weight, frame_size, 1,
-            value.gate_value + frame_size * 2, frame_size * 3, false, nullptr);
+      gemm.Sgemm(batch_size, frame_size, frame_size, 1,
+                 value.reset_output_value, frame_size, value.state_weight,
+                 frame_size, 1, value.gate_value + frame_size * 2,
+                 frame_size * 3, false, nullptr);
    }

    forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,

--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -36,6 +36,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  int M = dim_out[0];
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
+  Gemm gemm;

  if (trans_a) {
    int numel = matrix_a.numel();
@@ -50,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
        a[index++] = tmp[i * n + j];
      }
    }
+
 #ifdef _OPENMP
-    Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
-              matrix_out->data<float>(), N, relu, bias);
+
+    gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
+                   matrix_out->data<float>(), N, relu, bias);
 #else
-    Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
-          matrix_out->data<float>(), N, relu, bias);
+    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
+               matrix_out->data<float>(), N, relu, bias);
 #endif
  } else {
 #ifdef _OPENMP
-    Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
-              N, beta, matrix_out->data<float>(), N, relu, bias);
+    gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K,
+                   matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
+                   N, relu, bias);
 #else
-    Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-          beta, matrix_out->data<float>(), N, relu, bias);
+    gemm.Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
+               matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
+               relu, bias);
 #endif
  }
 }
@@ -74,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
                         float alpha, framework::Tensor *matrix_out, float beta,
                         bool relu, framework::Tensor *new_scale,
                         framework::Tensor *new_bias, int group, float *bias) {
+  Gemm gemm;
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -86,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
  int K = (!trans_a) ? dim_a[1] : dim_a[0];

 #ifdef _OPENMP
-  SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
-                  matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
-                  relu, new_scale->data<float>() + group,
-                  new_bias->data<float>() + group, bias);
+  gemm.SgemmWithBn_omp(
+      M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+      beta, matrix_out->data<float>(), N, relu,
+      new_scale->data<float>() + group, new_bias->data<float>() + group, bias);
 #else
-  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
-              N, beta, matrix_out->data<float>(), N, relu,
-              new_scale->data<float>() + group, new_bias->data<float>() + group,
-              bias);
+  gemm.SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K,
+                   matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
+                   N, relu, new_scale->data<float>() + group,
+                   new_bias->data<float>() + group, bias);
 #endif
 }
 void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
                     const framework::Tensor &matrix_b, bool trans_b,
                     framework::Tensor *matrix_out, float *p, std::string mode,
                     float *bias, float *bias1) {
+  Gemm gemm;
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -113,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
  int K = (!trans_a) ? dim_a[1] : dim_a[0];

 #ifdef _OPENMP
-  SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(),
-                     N, matrix_out->data<float>(), N, p, mode, bias, bias1);
+  gemm.SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K,
+                          matrix_b.data<float>(), N, matrix_out->data<float>(),
+                          N, p, mode, bias, bias1);
 #else
-  SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-                 matrix_out->data<float>(), N, p, mode, bias, bias1);
+  gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K,
+                      matrix_b.data<float>(), N, matrix_out->data<float>(), N,
+                      p, mode, bias, bias1);

 #endif
 }

--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
@@ -61,5 +61,7 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
 #endif
-
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(mul, ops::MulOp);
+#endif
 #endif
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -263,6 +263,10 @@ class OpParam {
  static const T GetAttr(const string &key, const AttributeMap &map) {
    return ((Attribute)map.at(key)).Get<T>();
  }
+  // Looks up `key` in `map` and returns that attribute's value as obtained
+  // from its GetString() accessor. The lookup goes through map.at(key), so
+  // the key is expected to be present already (see HasAttr).
+  static const std::string GetStringAttr(const string &key,
+                                         const AttributeMap &map) {
+    // Work on a local copy of the attribute, as the cast-based form did.
+    Attribute attr(map.at(key));
+    return attr.GetString();
+  }

  static const bool HasAttr(const string &key, const AttributeMap &map) {
    return map.count(key) > 0;
@@ -438,6 +442,15 @@ class MulParam : OpParam {
  GType *out_;
  int x_num_col_dims_;
  int y_num_col_dims_;
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::WrapperConvArgs fpga_conv_args;
+
+ public:
+  // Read-only access to the stored FPGA conv arguments (fpga_conv_args).
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  // Overwrites the stored FPGA conv arguments with a copy of `args`.
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif

@@ -493,7 +506,7 @@ class LrnParam : public OpParam {
    alpha_ = GetAttr<float>("alpha", attrs);
    beta_ = GetAttr<float>("beta", attrs);
    k_ = GetAttr<float>("k", attrs);
-    data_format_ = GetAttr<string>("data_format", attrs);
+    data_format_ = GetStringAttr("data_format", attrs);
  }

  const RType *InputX() const { return input_x_; }
@@ -590,7 +603,7 @@ class PoolParam : public OpParam {
    input_ = InputXFrom<GType>(inputs, scope);

    output_ = OutFrom<GType>(outputs, scope);
-    pooling_type_ = GetAttr<string>("pooling_type", attrs);
+    pooling_type_ = GetStringAttr("pooling_type", attrs);
    ksize_ = GetAttr<vector<int>>("ksize", attrs);
    strides_ = GetAttr<vector<int>>("strides", attrs);
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
@@ -724,7 +737,7 @@ class BoxCoderParam : public OpParam {
    input_priorboxvar_ = InputPriorBoxVarFrom<GType>(inputs, scope);
    input_targetbox_ = InputTargetBoxFrom<GType>(inputs, scope);
    output_box_ = OutputBoxFrom<GType>(outputs, scope);
-    code_type_ = GetAttr<std::string>("code_type", attrs);
+    code_type_ = GetStringAttr("code_type", attrs);
  }
  const RType *InputPriorBox() const { return input_priorbox_; }

@@ -1199,7 +1212,7 @@ class PReluParam : public OpParam {
    alpha_ = InputAlphaFrom<GType>(inputs, scope);
    framework::DDim dims = alpha_->dims();
    out_ = OutFrom<GType>(outputs, scope);
-    mode_ = GetAttr<std::string>("mode", attrs);
+    mode_ = GetStringAttr("mode", attrs);
    DLOG << "PReluParam mode after" << mode_;
  }
  const RType *InputX() const { return input_x_; }
@@ -1330,7 +1343,7 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
                          const AttributeMap &attrs, const Scope &scope)
      : ConvParam<Dtype>(inputs, outputs, attrs, scope) {
    alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope);
-    mode_ = OpParam::GetAttr<std::string>("mode", attrs);
+    mode_ = OpParam::GetStringAttr("mode", attrs);
    framework::DDim dims = alpha_->dims();
    bias_ = OpParam::InputYFrom<GType>(inputs, scope);
    axis_ = OpParam::GetAttr<int>("axis", attrs);
@@ -1373,7 +1386,7 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
      : ConvParam<Dtype>(inputs, outputs, attrs, scope) {
    bias1_ = OpParam::InputYFrom1<GType>(inputs, scope);
    alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope);
-    mode_ = OpParam::GetAttr<std::string>("mode", attrs);
+    mode_ = OpParam::GetStringAttr("mode", attrs);
    framework::DDim dims = alpha_->dims();
    bias_ = OpParam::InputYFrom<GType>(inputs, scope);
    output_ = OpParam::OutFrom<GType>(outputs, scope);
@@ -1980,8 +1993,8 @@ class GruParam : public OpParam {
        OutputBatchResetHiddenPrevFrom<GType>(outputs, scope);
    output_batch_hidden_ = OutputBatchHiddenFrom<GType>(outputs, scope);
    output_hidden_ = OutputHiddenFrom<GType>(outputs, scope);
-    activation_ = GetAttr<std::string>("activation", attrs);
-    gate_activation_ = GetAttr<std::string>("gate_activation", attrs);
+    activation_ = GetStringAttr("activation", attrs);
+    gate_activation_ = GetStringAttr("gate_activation", attrs);
    is_reverse_ = GetAttr<bool>("is_reverse", attrs);
  }
  const GType *InputInput() const { return input_input_; }

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -35,8 +35,8 @@ if (CON GREATER -1)
    ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-yolo paddle-mobile)
    # gen test
-    ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test_yolo_combined paddle-mobile)
+    ADD_EXECUTABLE(test-yolo-combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-yolo-combined paddle-mobile)
    set(FOUND_MATCH ON)

 endif ()
@@ -323,5 +323,10 @@ if (NOT FOUND_MATCH)
    target_link_libraries(test-fssd paddle-mobile)


+    # gen test
+    ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
+    target_link_libraries(test-multi-process paddle-mobile)
+
+
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif ()
--- a/test/common/test_gemm_accuracy.cpp
+++ b/test/common/test_gemm_accuracy.cpp
@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
    }
  }

-  paddle_mobile::operators::math::SgemmWithBn(
-      m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr);
+  paddle_mobile::operators::math::Gemm gemm;
+  gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
+                   nullptr);
  int eq = 0;
  int neq = 0;
  for (int i = 0; i < m * n; ++i) {

--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
@@ -18,8 +18,9 @@ static const char *g_resnet_combine = "../models/resnet50";
 int main() {
  DLOG << paddle_mobile::fpga::open_device();
  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
-  if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
-                         std::string(g_resnet_combine) + "/params", true)) {
+  //  if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
+  //                         std::string(g_resnet_combine) + "/params", true)) {
+  if (paddle_mobile.Load(std::string(g_resnet_combine), true)) {
    std::vector<int64_t> dims{1, 3, 224, 224};
    Tensor input_tensor;
    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),

--- a/test/framework/test_inference_api.cpp
+++ b/test/framework/test_inference_api.cpp
@@ -46,7 +46,12 @@ int main() {
  tensor_out.dtype = PaddleDType::FLOAT32;
  std::vector<PaddleTensor> outputs(1, tensor_out);

-  assert(predictor->Run(paddle_tensor_feeds, &outputs));
+  std::cout << " before predict " << std::endl;
+
+  predictor->Run(paddle_tensor_feeds, &outputs);
+
+  std::cout << " after predict " << std::endl;
+  //  assert();

  float* data_o = static_cast<float*>(outputs[0].data.data());
  for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {

--- a/test/net/test_multi_inference_predict.cpp
+++ b/test/net/test_multi_inference_predict.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include <thread>  // NOLINT
+#include "../test_helper.h"
+#include "../test_include.h"
+
+void fun_yolo();
+int fun_mobilenet();
+int main() {
+  // Instance constructed on the main thread; nothing below uses it.
+  // NOTE(review): presumably kept so that an instance also exists outside
+  // the worker threads - confirm it is still needed.
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> unused_instance;
+
+  // Sequential variant of the same test (disabled):
+  //  fun_yolo();
+  //  fun_mobilenet();
+
+  // Run both model tests at the same time, one thread per model.
+  std::thread yolo_thread{fun_yolo};
+  std::thread mobilenet_thread{fun_mobilenet};
+
+  // Wait for both workers before exiting.
+  yolo_thread.join();
+  mobilenet_thread.join();
+
+  return 0;
+}
+
+// Worker for thread 1: loads the model at g_yolo, then runs ten predictions
+// on a 1x3x227x227 input and prints the load time and the average
+// per-prediction time. Prints nothing if Load() fails.
+void fun_yolo() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  auto time1 = time();
+  if (paddle_mobile.Load(g_yolo, true)) {
+    auto time2 = time();
+    // Elapsed time of Load(): measured from time1 to time2 (the original
+    // passed time1 twice, which always reported ~0 ms).
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+
+    vector<int64_t> dims{1, 3, 227, 227};
+    Tensor input_tensor;
+    // NOTE(review): SetupTensor is given the bounds 0 and 1 - presumably it
+    // fills the tensor with values in that range; confirm in test_helper.h.
+    SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
+                       static_cast<float>(1));
+
+    // Flatten the tensor contents into the vector form Predict() takes.
+    vector<float> input(input_tensor.data<float>(),
+                        input_tensor.data<float>() + input_tensor.numel());
+
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
+    auto time4 = time();
+    // Total time of the ten runs divided by ten = average per prediction.
+    std::cout << "thread 1:   predict cost :" << time_diff(time3, time4) / 10
+              << "ms" << std::endl;
+  }
+}
+
+// Worker for thread 2: loads the model at g_mobilenet, runs one prediction on
+// the banana test image and prints the largest output element and its index,
+// then performs ten warm-up predictions followed by ten timed predictions and
+// prints the average time. Always returns 0.
+int fun_mobilenet() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  auto time1 = time();
+  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
+  //                     std::string(g_mobilenet_detect) + "/params", true);
+
+  auto isok = paddle_mobile.Load(g_mobilenet, true);
+  if (isok) {
+    auto time2 = time();
+    // Elapsed time of Load(): measured from time1 to time2 (the original
+    // passed time1 twice, which always reported ~0 ms).
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+
+    vector<float> input;
+    vector<int64_t> dims{1, 3, 224, 224};
+    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
+
+    auto vec_result = paddle_mobile.Predict(input, dims);
+    auto biggest = max_element(begin(vec_result), end(vec_result));
+    std::cout << " Max element is " << *biggest << " at position "
+              << distance(begin(vec_result), biggest) << std::endl;
+
+    // Warm up with ten predictions; their results are discarded.
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
+    // Timed section: ten predictions, results discarded.
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
+    auto time4 = time();
+    // Logs the result of the first prediction above. Done after time4 is
+    // taken so that logging does not count towards the measured time.
+    DLOG << vec_result;
+    // Total time of the ten runs divided by ten = average per prediction.
+    std::cout << "thread 2:  predict cost :" << time_diff(time3, time4) / 10
+              << "ms" << std::endl;
+  }
+
+  // Hint printed unconditionally. In English: "If the result is NaN, check
+  // whether test/images/g_test_image_1x3x224x224_banana exists."
+  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
+               "是否存在?"
+            << std::endl;
+  return 0;
+}
--- a/test/net/test_nlp.cpp
+++ b/test/net/test_nlp.cpp
@@ -60,7 +60,15 @@ int main() {
  std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
  //    1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479

-  std::vector<int64_t> ids{1791, 656, 1549, 281, 96};
+  std::vector<int64_t> ids{
+      2084, 635,  1035, 197,  990,  150,  1132, 2403, 546,  770,  4060, 3352,
+      1798, 1589, 1352, 98,   136,  3461, 3186, 1159, 515,  764,  278,  1178,
+      5044, 4060, 943,  932,  463,  1198, 3352, 374,  1198, 3352, 374,  2047,
+      1069, 1589, 3672, 1178, 1178, 2165, 1178, 2084, 635,  3087, 2236, 546,
+      2047, 1549, 546,  2047, 302,  2202, 398,  804,  397,  657,  804,  866,
+      932,  2084, 515,  2165, 397,  302,  2202, 526,  992,  906,  1215, 1589,
+      4493, 2403, 723,  932,  2084, 635,  1352, 932,  444,  2047, 1159, 1893,
+      1579, 59,   330,  98,   1296, 1159, 3430, 738,  3186, 1071, 2174, 3933};

  paddle_mobile::framework::LoDTensor words;
  auto size = static_cast<int>(ids.size());

--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
@@ -52,8 +52,8 @@ int main() {
 #else
    auto time3 = time();
    paddle_mobile.FeedData(input_tensor);
-    paddle_mobile.Predict_To(10);
-    paddle_mobile.Predict_From(10);
+    paddle_mobile.Predict_To(-1);
+    /*paddle_mobile.Predict_From(10);
    auto tensor_ptr = paddle_mobile.FetchResult(9);
    std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel()
              << std::endl;
@@ -63,7 +63,7 @@ int main() {

    auto time4 = time();
    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
-              << std::endl;
+              << std::endl;*/
 #endif
  }
  return 0;

--- a/test/operators/test_box_coder_op.cpp
+++ b/test/operators/test_box_coder_op.cpp
@@ -46,7 +46,7 @@ class TestBoxCoderOp {
          DLOG << " Input TargetBox is : " << op->Input("TargetBox")[0];
          DLOG << " OutputBox is : " << op->Output("OutputBox")[0];
          DLOG << " code_type : "
-               << op->GetAttrMap().at("code_type").Get<std::string>();
+               << op->GetAttrMap().at("code_type").GetString();
          std::shared_ptr<operators::BoxCoderOp<Dtype, float>> boxcoder =
              std::make_shared<operators::BoxCoderOp<Dtype, float>>(
                  op->Type(), op->GetInputs(), op->GetOutputs(),

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -121,6 +121,7 @@ if (CON GREATER -1)
  set(FUSION_CONVBNRELU_OP ON)
  set(FUSION_CONVBN_OP ON)
  set(FUSION_CONVADD_OP ON)
+  set(MUL_OP ON)

  set(FOUND_MATCH ON)
 endif()