Merge branch 'develop' into reshape2-dev

c76b22c2 · Ray Liu · GitHub · 01baa6e8 · a9f59fae · c76b22c2
16 changed file
--- a/README.md
+++ b/README.md
@@ -26,61 +26,10 @@ Paddle-Mobile是PaddlePaddle组织下的项目，是一个致力于嵌入式平

 - **ARM CPU**

-|mobilenet arm v7|1线程|2线程|4线程|
-|------------|----|-----|-----|
-|麒麟970(ms)|108.180|63.935|37.545|
-|麒麟960(ms)|108.588|63.073|36.822|
-|高通845(ms)|85.952|48.890|28.641|
-|高通835(ms)|105.434|62.752|37.131|
-|||||
-|mobilenetssd arm v7|1线程|2线程|4线程|
-|麒麟970(ms)|212.686|127.205|77.485|
-|麒麟960(ms)|212.641|125.338|75.250|
-|高通845(ms)|182.863|95.671|56.857|
-|高通835(ms)|213.849|127.717|77.006|
-|||||
-|googlenet(v1) arm v7|1线程|2线程|4线程|
-|麒麟970(ms)|335.288|234.559|161.295|
-|麒麟960(ms)|354.443|232.642|157.815|
-|高通845(ms)|282.007|173.146|122.148|
-|高通835(ms)|341.250|233.354|158.554|
-|||||
-|squeezenet arm v7|1线程|2线程|4线程|
-|麒麟970(ms)|83.726|57.944|36.923|
-|麒麟960(ms)|85.835|55.762|36.496|
-|高通845(ms)|71.301|41.618|28.785|
-|高通835(ms)|82.407|56.176|36.455|
-|||||
-|yolo arm v7|1线程|2线程|4线程|
-|麒麟970(ms)|129.658|79.993|49.969|
-|麒麟960(ms)|130.208|78.791|48.390|
-|高通845(ms)|109.244|61.736|40.600|
-|高通835(ms)|130.402|80.863|50.359|
-
-    测试机型信息：
-    麒麟970:荣耀v10     (2.36GHz * 4 + 1.8GHz * 4)
-    麒麟960:华为mate9   (2.36GHz * 4 + 1.8GHz * 4)
-    骁龙835:小米6       (2.45GHz * 4 + 1.9GHz * 4)
-    骁龙845:OPPO FindX  (2.80GHz * 4 + 1.8GHz * 4)
-    
 - **Mali GPU**

-    Mali GPU是百度和ARM合作开发的，双方团队近期都在致力于将paddle的op能无缝运行在ACL(arm compute library)。目前已经支持squeezenet，googlenet，resnet等几个网络模型，后续会继续加大力度。使全部移动端paddle op能高效运行在mali gpu上。 
-
 - **苹果设备的GPU Metal实现**

-|mobilenetfssd|速度|
-|------------|-----|
-|A9(ms)|33.78|
-|A10(ms)|24.05|
-|A11(ms)|17.15|
-|||
-|genet|速度|
-|A9(ms) |3.49|
-|A10(ms)|2.54|
-|A11(ms)|1.43|
-
-
 - **FPGA**

    目前已经支持 ZCU102 开发板。

--- a/doc/development_fpga.md
+++ b/doc/development_fpga.md
@@ -27,8 +27,9 @@ ___
 ## 准备模型和数据
 ___
 1. 模型文件放在./test/models/resnet50中。将[\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)文件复制到此文件夹下。
-2. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。
-3. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到/test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG，分类标签为80， 对应着"black grouse".
+2. 如果不存在，则创建文件夹./test/models/resnet50 和 ./test/images。
+3. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。
+4. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到./test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG，分类标签为80， 对应着"black grouse"。

 ## 运行程序
 ___

--- a/doc/development_ios.md
+++ b/doc/development_ios.md
@@ -34,7 +34,7 @@ cd ../build/release/ios/build
 libpaddle-mobile.a

 /src/ios_io/ 下的
-PaddleMobile.h
+PaddleMobileCPU.h
 ```
 拖入工程


--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -16,7 +16,6 @@ limitations under the License. */

 #pragma once
 #include <vector>
-#include "operators/math/conv_arm_int8.h"
 #include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv_3x3.h"
 #include "operators/math/im2col.h"
@@ -28,11 +27,12 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {

-template <typename Dtype>
+template <typename Itype, typename Otype>
 inline void ConvBasic(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor *output = param.Output();
+  output->mutable_data<Otype>();
  int groups = param.Groups();
  const std::vector<int> strides = param.Strides();
  const std::vector<int> paddings = param.Paddings();
@@ -60,7 +60,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
-    col.mutable_data<Dtype>(col_shape);
+    col.mutable_data<Itype>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }
@@ -79,8 +79,8 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;

-  math::Vol2ColFunctor<CPU, Dtype> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Dtype> im2col;
+  math::Vol2ColFunctor<CPU, Itype> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
@@ -109,69 +109,18 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);

-      math::matmul<Dtype>(filter_slice, false, col_matrix, false,
+      math::matmul<Itype>(filter_slice, false, col_matrix, false,
                          static_cast<float>(1), &out_slice,
                          static_cast<float>(0));
    }
  }
 }

-inline void ConvCompute_int8(const ConvParam<CPU> &param) {
-  typedef void (*ConvFunc)(const Tensor &input, const Tensor &kernel,
-                           Tensor *output);
-  static ConvFunc conv_funcs_table[7][5] = {
-      {0, 0, 0, 0, 0},                                // k = 1
-      {0, 0, 0, 0, 0}, {conv3x3s1_int8, 0, 0, 0, 0},  // k = 3
-      {0, 0, 0, 0, 0}, {conv5x5s1_int8, 0, 0, 0, 0},  // k = 5
-      {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0},               // k = 7
-  };
-  const Tensor *input = param.Input();
-  Tensor *filter = param.Filter();
-  Tensor *output = param.Output();
-  int groups = param.Groups();
-  const std::vector<int> &strides = param.Strides();
-  const std::vector<int> &paddings = param.Paddings();
-  const std::vector<int> &dilations = param.Dilations();
-  int kernel_h = filter->dims()[2];
-  int kernel_w = filter->dims()[3];
-  output->mutable_data<int32_t>();
-
-  ConvFunc conv_func = 0;
-  if (strides[1] == strides[0] && strides[1] < 6 && kernel_h == kernel_w &&
-      kernel_h < 8 && groups == 1 && dilations[0] == dilations[1] &&
-      dilations[1] == 1) {
-    conv_func = conv_funcs_table[kernel_h - 1][strides[0] - 1];
-  }
-  if (conv_func) {
-    int batch_size = input->dims()[0];
-    math::PadFunctor<CPU, int8_t> pad;
-
-    Tensor input_pad;
-    for (int i = 0; i < batch_size; ++i) {
-      Tensor in_batch = input->Slice(i, i + 1);
-      Tensor out_batch = output->Slice(i, i + 1);
-      if (paddings[0] == 0 && paddings[1] == 0) {
-        input_pad = in_batch;
-      } else {
-        framework::DDim pad_shape = in_batch.dims();
-        pad_shape[2] += 2 * paddings[0];
-        pad_shape[3] += 2 * paddings[1];
-        input_pad.mutable_data<int8_t>(pad_shape);
-        pad(in_batch, paddings[0], paddings[1], &input_pad);
-      }
-      conv_func(input_pad, *filter, &out_batch);
-    }
-  } else {
-    ConvBasic<int8_t>(param);
-  }
-}
-
 template <typename P>
 void ConvCompute(const ConvParam<CPU> &param) {
  if (param.Input()->type() == typeid(int8_t)) {
-    ConvCompute_int8(param);
+    ConvBasic<int8_t, int32_t>(param);
  } else {
-    param.Output()->mutable_data<float>();
    if (param.Groups() == param.Input()->dims()[1] &&
        param.Input()->dims()[1] == param.Output()->dims()[1] &&
        param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
@@ -185,7 +134,7 @@ void ConvCompute(const ConvParam<CPU> &param) {
      math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
                             param.Filter(), nullptr, param.Output(), false);
    } else {
-      ConvBasic<float>(param);
+      ConvBasic<float, float>(param);
    }
  }
 }

--- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) {
                                 Bias, false);

  } else {
-    ConvBasic<float>(param);
+    ConvBasic<float, float>(param);
  }
 }


--- a/src/operators/math/conv3x3_arm_int8.cpp
+++ b/src/operators/math/conv3x3_arm_int8.cpp
--- a/src/operators/math/conv5x5_arm_int8.cpp
+++ b/src/operators/math/conv5x5_arm_int8.cpp
--- a/src/operators/math/conv_arm_int8.h
+++ b/src/operators/math/conv_arm_int8.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef CONV_OP
-
-#pragma once
-
-#include "framework/tensor.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-void conv3x3s1_int8(const framework::Tensor& input,
-                    const framework::Tensor& weight, framework::Tensor* output);
-
-void conv3x3s1_int8_4c(const framework::Tensor& input,
-                       const framework::Tensor& weight,
-                       framework::Tensor* output);
-
-void conv5x5s1_int8(const framework::Tensor& input,
-                    const framework::Tensor& weight, framework::Tensor* output);
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -209,12 +209,18 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                      int32_t lda, int8_t *buffer);
  void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
                      int32_t ldb, int8_t *buffer);
+  void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
+                          int32_t lda, int8_t *buffer);
+  void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
+                          int32_t ldb, int8_t *buffer);

  // 8 bits int matrix product
  void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
             int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
             int32_t ldc, bool relu, int8_t *bias);
-
+  void Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
+                 int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
+                 int32_t *C, int32_t ldc, bool relu, int8_t *bias);
  // 8 bits int write back
  // C = alpha * A * B + beta * C
  void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,

--- a/src/operators/math/gemm_int8.cpp
+++ b/src/operators/math/gemm_int8.cpp
@@ -30,7 +30,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                     int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
  const int8_t *a_ptr, *b_ptr;
  a_ptr = a;
@@ -246,7 +246,7 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                     int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
  const int8_t *a_ptr, *b_ptr;
  a_ptr = a;
@@ -546,8 +546,12 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
 #pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
+#if __aarch64__
+    // TODO(wzzju)
+#else
      //      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#endif  // __aarch64__
    }
  }
  if (alpha != 1) {
@@ -682,7 +686,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
      const int8_t *b0 = &B(i, j);
 #if __ARM_NEON
 #if __aarch64__
-      // TODO
+      // TODO(wzzju)
 #else
      asm volatile(
          //          "pld        [%[b0]]                     \n\t"
@@ -791,7 +795,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
                      int32_t ldc) {
 #if __ARM_NEON
 #if __aarch64__
-// TODO
+// TODO(wzzju)
 #else
  int32_t nc1 = nc >> 4;
  int32_t _nc1 = nc & 15;

--- a/src/operators/math/gemm_omp_int8.cpp
+++ b/src/operators/math/gemm_omp_int8.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string.h>
+#include "common/log.h"
+#include "memory/t_malloc.h"
+#include "operators/math/gemm.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+// 8 bits int matrix product (m*k x k*n)
+void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
+                     const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
+                     int8_t beta, int32_t *C, int32_t ldc, bool relu,
+                     int8_t *bias) {
+#ifdef _OPENMP
+  int32_t max_threads = omp_get_max_threads();
+#else
+  int32_t max_threads = 1;
+#endif
+
+  int32_t L1 = 64 / max_threads * 1024;
+  KC = k;
+  zero_int8 =
+      static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
+  memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
+  if (m > n) {
+    // 对 A 分块
+    MC = L1 / (KC * sizeof(int8_t));
+    if (MC == 0) {
+      MC = MR_INT8;
+    } else {
+      int32_t mblock_num = (m + MC - 1) / MC;
+      MC = (m + mblock_num - 1) / mblock_num;
+      MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+    }
+    // 补齐 B
+    NC = (n + NR - 1) / NR * NR;
+
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
+#endif
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
+  } else {
+    // 对 B 分块
+    NC = L1 / (KC * sizeof(int8_t));
+    if (NC == 0) {
+      NC = NR;
+    } else {
+      int32_t nblock_num = (n + NC - 1) / NC;
+      NC = (n + nblock_num - 1) / nblock_num;
+      NC = (NC + NR - 1) / NR * NR;
+    }
+    // 补齐 A
+    MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
+#endif
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
+  }
+  packedC_int8 = static_cast<int32_t *>(
+      paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
+
+  if (m > n) {
+#pragma omp parallel for
+    for (int32_t i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+
+      int32_t mc;
+      mc = s_min(m - i, MC);
+      int8_t *local_A = packedA_int8 + MC * KC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
+#endif
+      InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
+                          &C(i, 0), ldc, relu, bias + i);
+    }
+  } else {
+#pragma omp parallel for
+    for (int32_t j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+      int32_t nc;
+      nc = s_min(n - j, NC);
+      int8_t *local_B = packedB_int8 + KC * NC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+#endif
+      InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
+                          &C(0, j), ldc, relu, bias);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA_int8);
+  paddle_mobile::memory::Free(packedB_int8);
+  paddle_mobile::memory::Free(packedC_int8);
+  paddle_mobile::memory::Free(zero_int8);
+}
+
+void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
+                              const int8_t *B, int32_t ldb, int8_t *buffer) {
+  const int32_t j_length = n - n_tail;
+#pragma omp parallel for
+  for (int32_t j = 0; j < j_length; j += NR) {
+    int8_t *local_buffer = buffer + j * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j);
+#if __ARM_NEON
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      asm volatile(
+          //          "pld        [%[b0]]                     \n\t"
+          "vld1.s8    {d0},   [%[b0]]         \n\t"
+          "vst1.s8    {d0},   [%[local_buffer]]!    \n\t"
+          : [local_buffer] "+r"(local_buffer)
+          : [b0] "r"(b0)
+          : "memory", "q0");
+#endif  // __aarch64__
+#else
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+#endif  // __ARM_NEON
+    }
+  }
+  if (n_tail != 0) {
+    int8_t *local_buffer = buffer + j_length * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j_length);
+      for (int32_t j = j_length; j < n; ++j) {
+        *local_buffer++ = *b0++;
+      }
+      for (int32_t j = n; j < j_length + NR; ++j) {
+        *local_buffer++ = 0;
+      }
+    }
+  }
+}
+
+void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
+                              const int8_t *A, int32_t lda, int8_t *buffer) {
+  const int i_length = m - m_tail;
+#pragma omp parallel for
+  for (int32_t i = 0; i < i_length; i += MR_INT8) {
+    const int8_t *a0 = A + i * lda;
+    const int8_t *a1 = A + (i + 1) * lda;
+    const int8_t *a2 = A + (i + 2) * lda;
+    const int8_t *a3 = A + (i + 3) * lda;
+    int8_t *local_buffer = buffer + i * k;
+    for (int32_t j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+
+  if (m_tail != 0) {
+    const int8_t *a0 = &A(i_length, 0);
+    const int8_t *a1 = a0 + lda;
+    const int8_t *a2 = a0 + 2 * lda;
+    const int8_t *a3 = a0 + 3 * lda;
+    int8_t *local_buffer = buffer + i_length * k;
+    switch (m_tail) {
+      case 1:
+        a1 = zero_int8;
+      case 2:
+        a2 = zero_int8;
+      case 3:
+        a3 = zero_int8;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/math/math_function_int8.cpp
+++ b/src/operators/math/math_function_int8.cpp
@@ -51,12 +51,23 @@ void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
      }
    }

+#ifdef _OPENMP
+    gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
+#else
    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
               matrix_out->data<int32_t>(), N, relu, bias);
+#endif
  } else {
+#ifdef _OPENMP
+    gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
+                   matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
+#else
    gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
               matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
               relu, bias);
+#endif
  }
 }
 }  // namespace math

--- a/test/common/test_gemm_int8_accuracy.cpp
+++ b/test/common/test_gemm_int8_accuracy.cpp
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "common/log.h"
 #include "memory/t_malloc.h"
 #include "operators/math/gemm.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP

 #define a(i, j) a[(i)*lda + (j)]
 #define b(i, j) b[(i)*ldb + (j)]
@@ -84,8 +87,13 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
  }

  paddle_mobile::operators::math::Gemm gemm;
+#ifdef _OPENMP
+  gemm.Sgemm_omp(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
+                 static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#else
  gemm.Sgemm(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
             static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#endif
  int eq = 0;
  int neq = 0;
  for (int i = 0; i < m * n; ++i) {
@@ -119,12 +127,17 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
 }

 int main() {
-  do_sgemm(9, 9, 9, false, 10);
+#ifdef _OPENMP
+  omp_set_num_threads(8);
+#endif
+  do_sgemm(9, 9, 9, false, 1);
  do_sgemm(10, 6, 12, false, 0);
  do_sgemm(512, 256, 384, false, 0);
  do_sgemm(1366, 768, 256, false, 0);
  do_sgemm(1255, 755, 333, false, 0);
-  do_sgemm(555, 777, 999, false, 0);
+  do_sgemm(599, 1133, 393, false, 0);
+  do_sgemm(777, 555, 999, false, 0);
+  do_sgemm(333, 797, 939, false, 0);
  do_sgemm(1024, 1024, 1024, false, 0);

  return 0;

--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -28,7 +28,7 @@ limitations under the License. */

 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(1);
+  paddle_mobile.SetThreadNum(8);
  Tensor aa, bb, cc;
  auto aaptr = aa.mutable_data<float>({m, k});
  auto bbptr = bb.mutable_data<float>({k, n});

--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -93,6 +93,8 @@ int TestMulOP() {
 }  // namespace paddle_mobile

 int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(8);
  paddle_mobile::TestMulOP<int8_t, int32_t>();
  paddle_mobile::TestMulOP<float, float>();
  return 0;

--- a/tools/build.sh
+++ b/tools/build.sh
@@ -160,7 +160,7 @@ build_for_ios() {
    fi
    cd "${BUILD_DIR}"
    make -j 8
-    cp ../../../src/ios_io/PaddleMobile.h ./build/PaddleMobile.h
+    cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
    cd ./build
    # 生成符号表
    ranlib *.a