Commit f756739e authored by xiebaiyuan, committed by hjchen2

Merge pull request #1083 from hjchen2/dev-latest

Fix a bug while loading the model
......@@ -341,7 +341,9 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
} else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) {
printer << tensor.data<int8_t>()[i] << " ";
printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
}
}
#endif
......
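A minimal standalone sketch (not part of the patch) of the usual reason for the new static_cast: streaming an int8_t prints it as a character, so it is widened to int before printing.

#include <cstdint>
#include <iostream>

int main() {
  int8_t v = 65;
  std::cout << v << "\n";                    // typically prints "A": int8_t is streamed as a char
  std::cout << static_cast<int>(v) << "\n";  // prints "65"
  return 0;
}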
......@@ -112,7 +112,9 @@ void Executor<Dtype, P>::LoadMemory(
uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
*data_buf += sizeof(uint32_t);
// lod information
uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
// uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
uint64_t lod_level = 0;
memcpy(&lod_level, *data_buf, sizeof(uint64_t));
*data_buf += sizeof(uint64_t);
auto *lod = tensor->mutable_lod();
......
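The fix replaces a reinterpret_cast dereference with memcpy, presumably because *data_buf is not guaranteed to be 8-byte aligned after the preceding 4-byte version field. A minimal sketch of the pattern, with a hypothetical helper name:

#include <cstdint>
#include <cstring>

// Hypothetical helper: read a uint64_t from a byte stream that may not be
// 8-byte aligned. Dereferencing a reinterpret_cast'ed uint64_t* here is
// undefined behavior and can fault on ARM; memcpy lets the compiler emit a
// safe load instead.
inline uint64_t ReadUnalignedU64(const char *buf) {
  uint64_t value = 0;
  std::memcpy(&value, buf, sizeof(uint64_t));
  return value;
}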
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/central-arm-func/conv_arm_int8.h"
namespace paddle_mobile {
namespace operators {
void conv3x3s1_int8(const framework::Tensor& input,
const framework::Tensor& weight,
framework::Tensor* output) {
// TODO(hjchen2)
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/central-arm-func/conv_arm_int8.h"
namespace paddle_mobile {
namespace operators {
void conv5x5s1_int8(const framework::Tensor& input,
const framework::Tensor& weight,
framework::Tensor* output) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const int8_t* in_data = input.data<int8_t>();
const int8_t* w_data = weight.data<int8_t>();
int32_t* out_data = output->mutable_data<int32_t>();
// make sure that batch size is 1
int input_c = input.dims()[1];
int input_h = input.dims()[2];
int input_w = input.dims()[3];
int output_c = output->dims()[1];
int output_h = output->dims()[2];
int output_w = output->dims()[3];
int image_size = input_h * input_w;
int out_image_size = output_h * output_w;
memset(out_data, 0, output_c * out_image_size * sizeof(int32_t));
for (int oc = 0; oc < output_c; ++oc) {
for (int ic = 0; ic < input_c; ++ic) {
const int8_t* kernel = w_data + (oc * input_c + ic) * 25;
int32_t* output0 = out_data;
int32_t* output1 = out_data + output_w;
// load kernel
asm volatile("vld1.8 {d0-d3}, [%0] \n"
: "=r"(kernel)
: // no output
: "memory", "q0", "q1");
int oh = 0;
for (; oh < output_h - 1; oh += 2) {
const int8_t* r0 = in_data + ic * image_size + oh * input_w;
const int8_t* r1 = r0 + input_w;
const int8_t* r2 = r1 + input_w;
const int8_t* r3 = r2 + input_w;
const int8_t* r4 = r3 + input_w;
const int8_t* r5 = r4 + input_w;
int ow = output_w >> 3;
int remain = output_w & 0x7;
if (ow > 0) {
asm volatile(
"0: \n"
"vld1.8 {d4-d5}, [%[r0]] \n" // r0
"add %[r0], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vdup.s8 d10, d0[0] \n"
"vdup.s8 d11, d0[1] \n"
"vdup.s8 d12, d0[2] \n"
"vdup.s8 d13, d0[3] \n"
"vdup.s8 d14, d0[4] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q14, d16, d18 \n"
"vaddl.s16 q15, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q14, q14, d16 \n"
"vaddw.s16 q15, q15, d17 \n"
"vld1.8 {d4-d5}, [%[r1]] \n" // r1
"add %[r1], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vmull.s8 q8, d4, d10 \n" // next row
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q10, d16, d18 \n"
"vaddl.s16 q11, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q10, q10, d16 \n"
"vaddw.s16 q11, q11, d17 \n"
"vdup.s8 d10, d0[5] \n"
"vdup.s8 d11, d0[6] \n"
"vdup.s8 d12, d0[7] \n"
"vdup.s8 d13, d1[0] \n"
"vdup.s8 d14, d1[1] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q14, q14, q12 \n"
"vadd.s32 q15, q15, q13 \n"
"vld1.8 {d4-d5}, [%[r2]] \n" // r2
"add %[r2], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vmull.s8 q8, d4, d10 \n" // next row
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q10, q10, q12 \n"
"vadd.s32 q11, q11, q13 \n"
"vdup.s8 d10, d1[2] \n"
"vdup.s8 d11, d1[3] \n"
"vdup.s8 d12, d1[4] \n"
"vdup.s8 d13, d1[5] \n"
"vdup.s8 d14, d1[6] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q14, q14, q12 \n"
"vadd.s32 q15, q15, q13 \n"
"vld1.8 {d4-d5}, [%[r3]] \n" // r3
"add %[r3], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vmull.s8 q8, d4, d10 \n" // next row
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q10, q10, q12 \n"
"vadd.s32 q11, q11, q13 \n"
"vdup.s8 d10, d1[7] \n"
"vdup.s8 d11, d2[0] \n"
"vdup.s8 d12, d2[1] \n"
"vdup.s8 d13, d2[2] \n"
"vdup.s8 d14, d2[3] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q14, q14, q12 \n"
"vadd.s32 q15, q15, q13 \n"
"vld1.8 {d4-d5}, [%[r4]] \n" // r4
"add %[r4], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vmull.s8 q8, d4, d10 \n" // next row
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q10, q10, q12 \n"
"vadd.s32 q11, q11, q13 \n"
"vdup.s8 d10, d2[4] \n"
"vdup.s8 d11, d2[5] \n"
"vdup.s8 d12, d2[6] \n"
"vdup.s8 d13, d2[7] \n"
"vdup.s8 d14, d3[0] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q14, q14, q12 \n"
"vadd.s32 q15, q15, q13 \n"
"vld1.32 {d24-d27}, [%[output0]] \n"
"vadd.s32 q12, q12, q14 \n"
"vadd.s32 q13, q13, q15 \n"
"vst1.32 {d24-d27}, [%[output0]]! \n"
"vld1.8 {d4-d5}, [%[r5]] \n" // row 5
"add %[r5], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q10, q10, q12 \n"
"vadd.s32 q11, q11, q13 \n"
"vld1.32 {d24-d27}, [%[output1]] \n"
"vadd.s32 q12, q12, q10 \n"
"vadd.s32 q13, q13, q11 \n"
"vst1.32 {d24-d27}, [%[output1]]! \n"
"subs %[ow], #1 \n"
"bne 0b \n"
: [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3),
[r4] "+r"(r4), [r5] "+r"(r5), [ow] "+r"(ow),
[output0] "+r"(output0), [output1] "+r"(output1)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
if (remain > 0) {
asm volatile(
"0: \n"
"vld1.8 d4, [%[r0]] \n"
"vld1.8 d5, [%[r1]] \n"
"vld1.8 d6, [%[r2]] \n"
"vld1.8 d7, [%[r3]] \n"
"vld1.8 d8, [%[r4]] \n"
"vld1.8 d9, [%[r5]] \n"
"add %[r0], #1 \n"
"add %[r1], #1 \n"
"add %[r2], #1 \n"
"add %[r3], #1 \n"
"add %[r4], #1 \n"
"add %[r5], #1 \n"
"vext.8 d10, d0, d1, #5 \n"
"vext.8 d11, d1, d2, #2 \n"
"vext.8 d12, d1, d2, #7 \n"
"vext.8 d13, d2, d3, #4 \n"
"vmull.s8 q7, d4, d0 \n"
"vmull.s8 q8, d5, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q10, d14, d16 \n"
"vaddw.s16 q10, q10, d18 \n"
"vadd.s32 d4, d20, d21 \n"
"vaddl.s16 q10, d15, d17 \n"
"vaddw.s16 q10, q10, d19 \n"
"vdup.s32 d14, d4[0] \n"
"vdup.s32 d15, d4[1] \n"
"vadd.s32 d15, d15, d14 \n"
"vdup.s32 d14, d20[0] \n"
"vadd.s32 d15, d15, d14 \n"
"ldr r6, [%[output0]] \n"
"vdup.s32 d14, r6 \n"
"vadd.s32 d15, d15, d14 \n"
"vst1.32 d15[0], [%[output0]]! \n"
"vmull.s8 q7, d5, d0 \n"
"vmull.s8 q8, d6, d10 \n"
"vmull.s8 q9, d7, d11 \n"
"vmlal.s8 q8, d8, d12 \n"
"vmlal.s8 q9, d9, d13 \n"
"vaddl.s16 q10, d14, d16 \n"
"vaddw.s16 q10, q10, d18 \n"
"vadd.s32 d4, d20, d21 \n"
"vaddl.s16 q10, d15, d17 \n"
"vaddw.s16 q10, q10, d19 \n"
"vdup.s32 d14, d4[0] \n"
"vdup.s32 d15, d4[1] \n"
"vadd.s32 d15, d15, d14 \n"
"vdup.s32 d14, d20[0] \n"
"vadd.s32 d15, d15, d14 \n"
"ldr r6, [%[output1]] \n"
"vdup.s32 d14, r6 \n"
"vadd.s32 d15, d15, d14 \n"
"vst1.32 d15[0], [%[output1]]! \n"
"subs %[remain], #1 \n"
"bne 0b \n"
: [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3),
[r4] "+r"(r4), [r5] "+r"(r5), [remain] "+r"(remain),
[output0] "+r"(output0), [output1] "+r"(output1)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "r6");
}
output0 += output_w;
output1 += output_w;
}
// remain output height
for (; oh < output_h; ++oh) {
const int8_t* r0 = in_data + ic * image_size + oh * input_w;
const int8_t* r1 = r0 + input_w;
const int8_t* r2 = r1 + input_w;
const int8_t* r3 = r2 + input_w;
const int8_t* r4 = r3 + input_w;
int ow = output_w >> 3;
int remain = output_w & 0x7;
if (ow > 0) {
asm volatile(
"0: \n"
"vld1.8 {d4-d5}, [%[r0]] \n" // r0
"add %[r0], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vdup.s8 d10, d0[0] \n"
"vdup.s8 d11, d0[1] \n"
"vdup.s8 d12, d0[2] \n"
"vdup.s8 d13, d0[3] \n"
"vdup.s8 d14, d0[4] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q14, d16, d18 \n"
"vaddl.s16 q15, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q14, q14, d16 \n"
"vaddw.s16 q15, q15, d17 \n"
"vld1.8 {d4-d5}, [%[r1]] \n" // r1
"add %[r1], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vdup.s8 d10, d0[5] \n"
"vdup.s8 d11, d0[6] \n"
"vdup.s8 d12, d0[7] \n"
"vdup.s8 d13, d1[0] \n"
"vdup.s8 d14, d1[1] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q14, q14, q12 \n"
"vadd.s32 q15, q15, q13 \n"
"vld1.8 {d4-d5}, [%[r2]] \n" // r2
"add %[r2], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vdup.s8 d10, d1[2] \n"
"vdup.s8 d11, d1[3] \n"
"vdup.s8 d12, d1[4] \n"
"vdup.s8 d13, d1[5] \n"
"vdup.s8 d14, d1[6] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q14, q14, q12 \n"
"vadd.s32 q15, q15, q13 \n"
"vld1.8 {d4-d5}, [%[r3]] \n" // r3
"add %[r3], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vdup.s8 d10, d1[7] \n"
"vdup.s8 d11, d2[0] \n"
"vdup.s8 d12, d2[1] \n"
"vdup.s8 d13, d2[2] \n"
"vdup.s8 d14, d2[3] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q14, q14, q12 \n"
"vadd.s32 q15, q15, q13 \n"
"vld1.8 {d4-d5}, [%[r4]] \n" // r4
"add %[r4], #8 \n"
"vext.8 d6, d4, d5, #1 \n"
"vext.8 d7, d4, d5, #2 \n"
"vext.8 d8, d4, d5, #3 \n"
"vext.8 d9, d4, d5, #4 \n"
"vdup.s8 d10, d2[4] \n"
"vdup.s8 d11, d2[5] \n"
"vdup.s8 d12, d2[6] \n"
"vdup.s8 d13, d2[7] \n"
"vdup.s8 d14, d3[0] \n"
"vmull.s8 q8, d4, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q12, d16, d18 \n"
"vaddl.s16 q13, d17, d19 \n"
"vmull.s8 q8, d9, d14 \n"
"vaddw.s16 q12, q12, d16 \n"
"vaddw.s16 q13, q13, d17 \n"
"vadd.s32 q14, q14, q12 \n"
"vadd.s32 q15, q15, q13 \n"
"vld1.32 {d24-d27}, [%[output0]] \n"
"vadd.s32 q12, q12, q14 \n"
"vadd.s32 q13, q13, q15 \n"
"vst1.32 {d24-d27}, [%[output0]]! \n"
"subs %[ow], #1 \n"
"bne 0b \n"
: [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3),
[r4] "+r"(r4), [ow] "+r"(ow), [output0] "+r"(output0)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
if (remain > 0) {
asm volatile(
"0: \n"
"vld1.8 d4, [%[r0]] \n"
"vld1.8 d5, [%[r1]] \n"
"vld1.8 d6, [%[r2]] \n"
"vld1.8 d7, [%[r3]] \n"
"vld1.8 d8, [%[r4]] \n"
"add %[r0], #1 \n"
"add %[r1], #1 \n"
"add %[r2], #1 \n"
"add %[r3], #1 \n"
"add %[r4], #1 \n"
"vext.8 d10, d0, d1, #5 \n"
"vext.8 d11, d1, d2, #2 \n"
"vext.8 d12, d1, d2, #7 \n"
"vext.8 d13, d2, d3, #4 \n"
"vmull.s8 q7, d4, d0 \n"
"vmull.s8 q8, d5, d10 \n"
"vmull.s8 q9, d6, d11 \n"
"vmlal.s8 q8, d7, d12 \n"
"vmlal.s8 q9, d8, d13 \n"
"vaddl.s16 q10, d14, d16 \n"
"vaddw.s16 q10, q10, d18 \n"
"vadd.s32 d4, d20, d21 \n"
"vaddl.s16 q10, d15, d17 \n"
"vaddw.s16 q10, q10, d19 \n"
"vdup.s32 d14, d4[0] \n"
"vdup.s32 d15, d4[1] \n"
"vadd.s32 d15, d15, d14 \n"
"vdup.s32 d14, d20[0] \n"
"vadd.s32 d15, d15, d14 \n"
"ldr r6, [%[output0]] \n"
"vdup.s32 d14, r6 \n"
"vadd.s32 d15, d15, d14 \n"
"vst1.32 d15[0], [%[output0]]! \n"
"subs %[remain], #1 \n"
"bne 0b \n"
: [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3),
[r4] "+r"(r4), [remain] "+r"(remain), [output0] "+r"(output0)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "r6");
}
}
}
out_data += out_image_size;
}
#else
// TODO(hjchen2)
#endif
}
} // namespace operators
} // namespace paddle_mobile
#endif
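For reference, a plain scalar sketch (not part of the patch) of what the NEON kernel above computes: a 5x5, stride-1, no-padding, single-batch NCHW int8 convolution accumulating into int32. The function name and signature are hypothetical.

#include <cstdint>

void conv5x5s1_int8_ref(const int8_t *in, const int8_t *w, int32_t *out,
                        int input_c, int input_h, int input_w,
                        int output_c, int output_h, int output_w) {
  // out must be zero-initialized by the caller, as in the NEON version.
  for (int oc = 0; oc < output_c; ++oc) {
    for (int ic = 0; ic < input_c; ++ic) {
      const int8_t *kernel = w + (oc * input_c + ic) * 25;  // 5x5 weights
      for (int oh = 0; oh < output_h; ++oh) {
        for (int ow = 0; ow < output_w; ++ow) {
          const int8_t *img = in + (ic * input_h + oh) * input_w + ow;
          int32_t acc = 0;
          for (int kh = 0; kh < 5; ++kh) {
            for (int kw = 0; kw < 5; ++kw) {
              acc += static_cast<int32_t>(img[kh * input_w + kw]) *
                     static_cast<int32_t>(kernel[kh * 5 + kw]);
            }
          }
          out[(oc * output_h + oh) * output_w + ow] += acc;
        }
      }
    }
  }
}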
......@@ -16,15 +16,18 @@ limitations under the License. */
#pragma once
#include <vector>
#include "operators/kernel/central-arm-func/conv_arm_int8.h"
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
inline void ConvBasic(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
......@@ -111,6 +114,57 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
}
}
inline void ConvBasic_int8(const ConvParam<CPU> &param) {
typedef void (*ConvFunc)(const Tensor &input, const Tensor &kernel,
Tensor *output);
static ConvFunc conv_funcs_table[7][5] = {
{0, 0, 0, 0, 0},               // k = 1
{0, 0, 0, 0, 0},               // k = 2
{0, 0, 0, 0, 0},               // k = 3
{0, 0, 0, 0, 0},               // k = 4
{conv5x5s1_int8, 0, 0, 0, 0},  // k = 5
{0, 0, 0, 0, 0},               // k = 6
{0, 0, 0, 0, 0},               // k = 7
};
const Tensor *input = param.Input();
Tensor *filter = param.Filter();
Tensor *output = param.Output();
output->mutable_data<int32_t>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
int kernel_h = filter->dims()[2];
int kernel_w = filter->dims()[3];
const int batch_size = static_cast<int>(input->dims()[0]);
math::PadFunctor<CPU, int8_t> pad;
Tensor input_pad;
for (int i = 0; i < batch_size; ++i) {
Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1);
if (paddings[0] == 0 && paddings[1] == 0) {
input_pad = in_batch;
} else {
framework::DDim pad_shape = in_batch.dims();
pad_shape[2] += 2 * paddings[0];
pad_shape[3] += 2 * paddings[1];
input_pad.mutable_data<int8_t>(pad_shape);
pad(in_batch, paddings[0], paddings[1], &input_pad);
}
// the int8 fast path is only taken for square kernels with equal strides, no dilation, and groups == 1
if (strides[1] == strides[0] && strides[1] < 6 && kernel_h == kernel_w &&
kernel_h < 8 && dilations[0] == 0 && dilations[1] == 0 && groups == 1) {
ConvFunc conv_func = conv_funcs_table[kernel_h - 1][strides[1] - 1];
if (conv_func) {
conv_func(input_pad, *filter, &out_batch);
} else {
// TODO(hjchen2)
}
} else {
// TODO(hjchen2)
}
}
}
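// Dispatch illustration (hypothetical values): a 5x5 kernel with stride 1 picks
// conv_funcs_table[5 - 1][1 - 1] == conv5x5s1_int8; every other (kernel, stride)
// slot is still a null pointer and falls through to the TODO branch above.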
template <typename P>
void ConvCompute(const ConvParam<CPU> &param) {
if (param.Groups() == param.Input()->dims()[1] &&
......@@ -126,7 +180,11 @@ void ConvCompute(const ConvParam<CPU> &param) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), nullptr, param.Output(), false);
} else {
ConvBasic(param);
if (param.Input()->type() == typeid(int8_t)) {
ConvBasic_int8(param);
} else {
ConvBasic(param);
}
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
void conv3x3s1_int8(const framework::Tensor& input,
const framework::Tensor& weight, framework::Tensor* output);
void conv5x5s1_int8(const framework::Tensor& input,
const framework::Tensor& weight, framework::Tensor* output);
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/pad.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <typename T>
class PadFunctor<CPU, T> {
public:
void operator()(const framework::Tensor &input, const int pad_h,
const int pad_w, framework::Tensor *output) {
const T *in_data = input.data<T>();
T *out_data = output->mutable_data<T>();
const framework::DDim &input_shape = input.dims();
const framework::DDim &output_shape = output->dims();
// fill output with 0
memset(out_data, 0, sizeof(T) * output->numel());
// the caller must make sure the output shape matches the input shape plus padding
for (int i = 0; i < input_shape[0]; ++i) {
for (int c = 0; c < input_shape[1]; ++c) {
out_data += pad_h * output_shape[3];
for (int h = 0; h < input_shape[2]; ++h) {
memcpy(out_data + pad_w, in_data, sizeof(T) * input_shape[3]);
out_data += output_shape[3];
in_data += input_shape[3];
}
out_data += pad_h * output_shape[3];
}
}
}
};
template class PadFunctor<CPU, float>;
template class PadFunctor<CPU, int8_t>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
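A hypothetical usage sketch of PadFunctor (assumed to sit inside namespace paddle_mobile::operators), mirroring the call pattern in ConvBasic_int8 above: zero-pad an int8 NCHW tensor by pad_h rows and pad_w columns on each side.

framework::Tensor ZeroPadInt8(const framework::Tensor &input, int pad_h,
                              int pad_w) {
  framework::Tensor padded;
  framework::DDim pad_shape = input.dims();  // [N, C, H, W]
  pad_shape[2] += 2 * pad_h;                 // padded height
  pad_shape[3] += 2 * pad_w;                 // padded width
  padded.mutable_data<int8_t>(pad_shape);
  math::PadFunctor<CPU, int8_t> pad;
  pad(input, pad_h, pad_w, &padded);
  return padded;
}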
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <typename DeviceType, typename T>
class PadFunctor {
public:
void operator()(const framework::Tensor &input, const int pad_h,
const int pad_w, framework::Tensor *output);
};
} // namespace math
} // namespace operators
} // namespace paddle_mobile