From 022f12916b421ea311b4efe2874315937d406b18 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Tue, 23 Oct 2018 02:58:26 +0000 Subject: [PATCH] Optimize elementwise/relu/im2col, support 1x1 and 7x7 conv using int8, fix some code style --- src/io/executor.cpp | 10 +- src/operators/dequantize_op.cpp | 4 + src/operators/dequantize_op.h | 4 + .../kernel/arm/dequantize_kernel.cpp | 2 +- src/operators/kernel/arm/quantize_kernel.cpp | 4 +- .../elementwise_add_arm_func.h | 46 +++++ .../kernel/central-arm-func/relu_arm_func.h | 162 +++++++++++------- src/operators/kernel/dequantize_kernel.h | 4 + src/operators/kernel/quantize_kernel.h | 4 + src/operators/math/conv3x3_arm_int8.cpp | 6 +- src/operators/math/conv5x5_arm_int8.cpp | 5 +- src/operators/math/gemm_int8.cpp | 2 + src/operators/math/im2col.cpp | 140 +++++++++------ src/operators/quantize_op.cpp | 4 + src/operators/quantize_op.h | 4 + test/operators/test_int8_conv_op.cpp | 89 ++++++---- tools/op.cmake | 9 + tools/pre-commit.hooks/cpplint.hook | 2 +- 18 files changed, 340 insertions(+), 161 deletions(-) diff --git a/src/io/executor.cpp b/src/io/executor.cpp index 100a774054..9efec27c9d 100644 --- a/src/io/executor.cpp +++ b/src/io/executor.cpp @@ -80,12 +80,13 @@ Executor::Executor(const framework::Program p, int batch_size, } template -void LoadMemInternal(void **data, framework::LoDTensor *tensor) { +static void LoadMemInternal(void **data, framework::LoDTensor *tensor, + bool quant_uint8 = false) { char **data_buf = reinterpret_cast(data); int64_t size = tensor->numel(); Dtype *tensor_data = tensor->mutable_data(); - if (0) { - // TODO(hjchen2) should be moved into operator init function + if (quant_uint8) { + // should be moved into operator init function float min_value; float max_value; memcpy(&min_value, data_buf, sizeof(float)); @@ -141,7 +142,8 @@ void Executor::LoadMemory( // parse tensor from stream switch (tensor_desc.DataType()) { case framework::VARTYPE_TYPE_FP32: - LoadMemInternal(reinterpret_cast(data_buf), tensor); + LoadMemInternal(reinterpret_cast(data_buf), tensor, + program_.quantification); break; case framework::VARTYPE_TYPE_INT8: LoadMemInternal(reinterpret_cast(data_buf), tensor); diff --git a/src/operators/dequantize_op.cpp b/src/operators/dequantize_op.cpp index df835e3007..21cd96368c 100644 --- a/src/operators/dequantize_op.cpp +++ b/src/operators/dequantize_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef DEQUANT_OP + #include "operators/dequantize_op.h" namespace paddle_mobile { @@ -30,3 +32,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp); #endif + +#endif diff --git a/src/operators/dequantize_op.h b/src/operators/dequantize_op.h index 4855f27fc8..906167a9a2 100644 --- a/src/operators/dequantize_op.h +++ b/src/operators/dequantize_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef DEQUANT_OP + #pragma once #include @@ -41,3 +43,5 @@ class DequantizeOp } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/dequantize_kernel.cpp b/src/operators/kernel/arm/dequantize_kernel.cpp index 935ce470a8..cd6c8d17f1 100644 --- a/src/operators/kernel/arm/dequantize_kernel.cpp +++ b/src/operators/kernel/arm/dequantize_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_MOBILE_CPU +#ifdef DEQUANT_OP #include "operators/kernel/dequantize_kernel.h" diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index fe8256a1ea..e7552d2602 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_MOBILE_CPU +#ifdef QUANT_OP #include "operators/kernel/quantize_kernel.h" #include @@ -225,7 +225,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale, const float *x = input->data(); int8_t *y = output->mutable_data(); size_t size = input->numel(); -#ifdef defined(__ARM_NEON__) || defined(__ARM_NEON) +#if defined(__ARM_NEON__) || defined(__ARM_NEON) size_t loop = size >> 4; size_t remain = size & 0xF; for (size_t i = 0; i < loop; ++i) { diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h index ace72b6fad..75d32c7985 100644 --- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h +++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h @@ -15,8 +15,12 @@ limitations under the License. */ #ifdef ELEMENTWISEADD_OP #pragma once + #include "operators/math/elementwise_op_function.h" #include "operators/op_param.h" +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif namespace paddle_mobile { namespace operators { @@ -33,8 +37,50 @@ void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { Tensor *Out = param.Out(); Out->mutable_data(); int axis = param.Axis(); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + size_t batch = 1; + size_t elementwise_num = 1; + for (int i = 0; i < axis; ++i) { + batch *= input_x->dims()[i]; + } + for (int i = axis + 1; i < input_x->dims().size(); ++i) { + elementwise_num *= input_x->dims()[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < input_x->dims()[axis]; ++j) { + size_t offset = (i * input_x->dims()[axis] + j) * elementwise_num; + const float *input = input_x->data() + offset; + const float *bias = input_y->data() + j; + float *output = Out->mutable_data() + offset; + + int loop = elementwise_num >> 0x4; + int remain = elementwise_num & 0xF; + for (int k = 0; k < loop; ++k) { + float32x4_t rb = vdupq_n_f32(*bias); + float32x4_t r0 = vld1q_f32(input); + float32x4_t r1 = vld1q_f32(input + 4); + float32x4_t r2 = vld1q_f32(input + 8); + float32x4_t r3 = vld1q_f32(input + 12); + r0 = vaddq_f32(r0, rb); + r1 = vaddq_f32(r1, rb); + r2 = vaddq_f32(r2, rb); + r3 = vaddq_f32(r3, rb); + vst1q_f32(output, r0); + vst1q_f32(output + 4, r1); + vst1q_f32(output + 8, r2); + vst1q_f32(output + 12, r3); + input += 16; + output += 16; + } + for (int k = 0; k < remain; ++k) { + output[k] = input[k] + *bias; + } + } + } +#else ElementwiseComputeEx, float>(input_x, input_y, axis, AddFunctor(), Out); +#endif } template class ElementwiseAddKernel; diff --git a/src/operators/kernel/central-arm-func/relu_arm_func.h b/src/operators/kernel/central-arm-func/relu_arm_func.h index d68569c0a5..38b2e6f334 100644 --- a/src/operators/kernel/central-arm-func/relu_arm_func.h +++ b/src/operators/kernel/central-arm-func/relu_arm_func.h @@ -17,6 +17,9 @@ limitations under the License. */ #include #include "operators/op_param.h" +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif namespace paddle_mobile { namespace operators { @@ -37,71 +40,100 @@ void ReluCompute(const ReluParam ¶m) { auto *out_ptr = out->mutable_data(); int numel = input_x->numel(); - // if (numel > 64) { - // asm volatile( - // "pld [%[input_x_ptr], #0] \n\t" - // "vmov.f32 q8, #0.0 \n\t" - // "subs %[num], %[num], #32 \n\t" - // "blt end_num_%= \n\t" - // "loop_num_%=: \n\t" - // "pld [%[input_x_ptr], #1024] \n\t" - // - // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" - // - // "vmax.f32 q0, q0, q8 \n\t" - // "vmax.f32 q1, q1, q8 \n\t" - // "vmax.f32 q2, q2, q8 \n\t" - // "vmax.f32 q3, q3, q8 \n\t" - // "vmax.f32 q4, q4, q8 \n\t" - // "vmax.f32 q5, q5, q8 \n\t" - // "vmax.f32 q6, q6, q8 \n\t" - // "vmax.f32 q7, q7, q8 \n\t" - // - // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" - // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" - // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" - // - // "subs %[num], %[num], #32 \n\t" - // "bge loop_num_%= \n\t" - // "end_num_%=: \n\t" - // "cmp %[num], #0 \n\t" - // "bge end_%= \n\t" - // "mov r6, #4 \n\t" - // "mul r5, %[num], r6 \n\t" - // "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" - // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" - // "vmax.f32 q0, q0, q8 \n\t" - // "vmax.f32 q1, q1, q8 \n\t" - // "vmax.f32 q2, q2, q8 \n\t" - // "vmax.f32 q3, q3, q8 \n\t" - // "vmax.f32 q4, q4, q8 \n\t" - // "vmax.f32 q5, q5, q8 \n\t" - // "vmax.f32 q6, q6, q8 \n\t" - // "vmax.f32 q7, q7, q8 \n\t" - // "add %[out_ptr], %[out_ptr], r5 \n\t" - // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" - // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" - // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" - // "end_%=: \n\t" - // : - // : - // [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] - // "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - // "q7", "q8", "r5", - // "r6"); - // } else { - ReluFunctor func_; - math::Transform trans; - trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); - // } +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#if __aarch64__ + if (numel > 0) { + int loop = numel >> 0x4; + int remain = numel & 0xF; + float32x4_t zero = vdupq_n_f32(0.f); + for (int i = 0; i < loop; ++i) { + float32x4_t r0 = vld1q_f32(input_x_ptr); + float32x4_t r1 = vld1q_f32(input_x_ptr + 4); + float32x4_t r2 = vld1q_f32(input_x_ptr + 8); + float32x4_t r3 = vld1q_f32(input_x_ptr + 12); + r0 = vmaxq_f32(r0, zero); + r1 = vmaxq_f32(r1, zero); + r2 = vmaxq_f32(r2, zero); + r3 = vmaxq_f32(r3, zero); + vst1q_f32(out_ptr, r0); + vst1q_f32(out_ptr + 4, r1); + vst1q_f32(out_ptr + 8, r2); + vst1q_f32(out_ptr + 12, r3); + input_x_ptr += 16; + out_ptr += 16; + } + for (int i = 0; i < remain; ++i) { + out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i]; + } +#else + if (numel > 64) { + asm volatile( + "pld [%[input_x_ptr], #0] \n\t" + "vmov.f32 q8, #0.0 \n\t" + "subs %[num], %[num], #32 \n\t" + "blt end_num_%= \n\t" + "loop_num_%=: \n\t" + "pld [%[input_x_ptr], #1024] \n\t" + + "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" + + "vmax.f32 q0, q0, q8 \n\t" + "vmax.f32 q1, q1, q8 \n\t" + "vmax.f32 q2, q2, q8 \n\t" + "vmax.f32 q3, q3, q8 \n\t" + "vmax.f32 q4, q4, q8 \n\t" + "vmax.f32 q5, q5, q8 \n\t" + "vmax.f32 q6, q6, q8 \n\t" + "vmax.f32 q7, q7, q8 \n\t" + + "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + + "subs %[num], %[num], #32 \n\t" + "bge loop_num_%= \n\t" + "end_num_%=: \n\t" + "cmp %[num], #0 \n\t" + "bge end_%= \n\t" + "mov r6, #4 \n\t" + "mul r5, %[num], r6 \n\t" + "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" + "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" + "vmax.f32 q0, q0, q8 \n\t" + "vmax.f32 q1, q1, q8 \n\t" + "vmax.f32 q2, q2, q8 \n\t" + "vmax.f32 q3, q3, q8 \n\t" + "vmax.f32 q4, q4, q8 \n\t" + "vmax.f32 q5, q5, q8 \n\t" + "vmax.f32 q6, q6, q8 \n\t" + "vmax.f32 q7, q7, q8 \n\t" + "add %[out_ptr], %[out_ptr], r5 \n\t" + "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + "end_%=: \n\t" + : + : + [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5", + "r6"); +#endif + } else { +#endif + ReluFunctor func_; + math::Transform trans; + trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + } +#endif } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/dequantize_kernel.h b/src/operators/kernel/dequantize_kernel.h index 3d0437875b..d147e3f94a 100644 --- a/src/operators/kernel/dequantize_kernel.h +++ b/src/operators/kernel/dequantize_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef DEQUANT_OP + #pragma once #include "framework/operator.h" @@ -30,3 +32,5 @@ class DequantizeKernel } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/quantize_kernel.h b/src/operators/kernel/quantize_kernel.h index 7a35d03ba7..c55ca2182a 100644 --- a/src/operators/kernel/quantize_kernel.h +++ b/src/operators/kernel/quantize_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef QUANT_OP + #pragma once #include "framework/operator.h" @@ -30,3 +32,5 @@ class QuantizeKernel } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/math/conv3x3_arm_int8.cpp b/src/operators/math/conv3x3_arm_int8.cpp index f8a4e9f409..283dcb2255 100644 --- a/src/operators/math/conv3x3_arm_int8.cpp +++ b/src/operators/math/conv3x3_arm_int8.cpp @@ -36,7 +36,9 @@ void conv3x3s1_int8(const framework::Tensor& input, int image_size = input_h * input_w; int out_image_size = output_h * output_w; memset(out_data, 0, output_c * out_image_size * sizeof(int32_t)); - +#if __aarch64__ + // TODO(hjchen2) +#else int oc = 0; #pragma omp parallel for for (; oc < output_c - 1; oc += 2) { @@ -747,7 +749,7 @@ void conv3x3s1_int8(const framework::Tensor& input, } } } - +#endif #else // TODO(hjchen2) #endif diff --git a/src/operators/math/conv5x5_arm_int8.cpp b/src/operators/math/conv5x5_arm_int8.cpp index 7a0f0a40fb..c861c22d18 100644 --- a/src/operators/math/conv5x5_arm_int8.cpp +++ b/src/operators/math/conv5x5_arm_int8.cpp @@ -36,7 +36,9 @@ void conv5x5s1_int8(const framework::Tensor& input, int image_size = input_h * input_w; int out_image_size = output_h * output_w; memset(out_data, 0, output_c * out_image_size * sizeof(int32_t)); - +#if __aarch64__ + // TODO(hjchen2) +#else #pragma omp parallel for for (int oc = 0; oc < output_c; ++oc) { for (int ic = 0; ic < input_c; ++ic) { @@ -537,6 +539,7 @@ void conv5x5s1_int8(const framework::Tensor& input, } } } +#endif #else // TODO(hjchen2) #endif diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp index bd5286dbcb..51953dbf0c 100644 --- a/src/operators/math/gemm_int8.cpp +++ b/src/operators/math/gemm_int8.cpp @@ -642,6 +642,7 @@ void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, // C = A * B, 8位 int32_t void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc) { +#if __ARM_NEON int32_t nc1 = nc >> 4; int32_t _nc1 = nc & 15; int32_t step = sizeof(int32_t) * ldc; @@ -695,6 +696,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, } } } +#endif // __ARM_NEON } // C = A * B + C diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index 502e29a7a9..4c81e7fa3b 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -397,7 +397,7 @@ void Im2ColFunctor::operator()( col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || im_col_idx < 0 || im_col_idx >= im_width) - ? static_cast(0) + ? static_cast(0) : im_data[im_idx]; } } @@ -405,10 +405,68 @@ void Im2ColFunctor::operator()( #endif } -// TODO(hjchen2) -void ExtractToRows1() {} - -void ExtractToRows2() {} +void ExtractToImg(const int8_t *im_data, int8_t *col_data, const int im_height, + const int im_width, const int col_height, const int col_width, + const int padding_h, const int padding_w, const int stride_h, + const int stride_w, const int kh, const int kw) { + int h = padding_h - kh; + int w = padding_w - kw; + int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0; + int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0; + int start_height = kh + col_start_height * stride_h - padding_h; + int start_width = kw + col_start_width * stride_w - padding_w; + + int end_height = (col_height - col_start_height) * stride_h + start_height; + end_height = end_height > im_height ? im_height : end_height; + int end_width = (col_width - col_start_width) * stride_w + start_width; + end_width = end_width > im_width ? im_width : end_width; + int extract = (end_width - start_width + stride_w - 1) / stride_w; + + im_data += start_height * im_width + start_width; + col_data += col_start_height * col_width + col_start_width; + for (int i = start_height; i < end_height; i += stride_h) { + if (stride_w == 1) { + memcpy(col_data, im_data, extract * sizeof(int8_t)); + } else if (stride_w == 2) { + int s = 0; +#if __ARM_NEON + for (; s < extract - 15; s += 16) { + int8x16x2_t img = vld2q_s8(im_data + s * 2); + vst1q_s8(col_data + s, img.val[0]); + } +#endif + for (; s < extract; ++s) { + col_data[s] = im_data[s * 2]; + } + } else if (stride_w == 3) { + int s = 0; +#if __ARM_NEON + for (; s < extract - 15; s += 16) { + int8x16x3_t img = vld3q_s8(im_data + s * 3); + vst1q_s8(col_data + s, img.val[0]); + } +#endif + for (; s < extract; ++s) { + col_data[s] = im_data[s * 3]; + } + } else if (stride_w == 4) { + int s = 0; +#if __ARM_NEON + for (; s < extract - 15; s += 16) { + int8x16x4_t img = vld4q_s8(im_data + s * 4); + vst1q_s8(col_data + s, img.val[0]); + } +#endif + for (; s < extract; ++s) { + col_data[s] = im_data[s * 4]; + } + } else { + PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4."); + } + im_data += im_width * stride_h; + col_data += col_width; + } +} /* * im = [input_channels, input_height, input_width] @@ -432,64 +490,42 @@ void Im2ColFunctor::operator()( int channels_col = im_channels * filter_height * filter_width; const int8_t *im_data = im.data(); int8_t *col_data = col->data(); -// #if defined(__ARM_NEON__) || defined(__ARM_NEON) -#if 0 - if (stride[0] == stride[1] && stride[0] == 1 && dilation[0] == 1 && - padding[0] == padding[1] && dilation[0] == dilation[1]) { +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) { // pad 0 memset(col_data, 0, col->numel() * sizeof(int8_t)); for (int ic = 0; ic < im_channels; ++ic) { - for (int oh = 0; oh < padding[0]; ++oh) { - for (int k = 0; k < filter_height * filter_width; ++k) { - ExtractToRows1(); - ExtractToRows1(); - } - } - for (int oh = padding[0]; oh < col_height - padding[0]; ++oh) { - for (int k = 0; k < filter_height * filter_width; ++k) { - ExtractToRows1(); - } - } - } - } else if (stride[0] == stride[1] && stride[0] == 2 && dilation[0] == 1 && - padding[0] == padding[1] && dilation[0] == dilation[1]) { - // pad 0 - memset(col_data, 0, col->numel() * sizeof(int8_t)); - for (int ic = 0; ic < im_channels; ++ic) { - for (int oh = 0; oh < padding[0]; ++oh) { - for (int k = 0; k < filter_height * filter_width; ++k) { - ExtractToRows2(); - ExtractToRows2(); - } - } - for (int oh = padding[0]; oh < col_height - padding[0]; ++oh) { - for (int k = 0; k < filter_height * filter_width; ++k) { - ExtractToRows2(); + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + ExtractToImg(im_data, col_data, im_height, im_width, col_height, + col_width, padding[0], padding[1], stride[0], stride[1], + kh, kw); + col_data += col_height * col_width; } } + im_data += im_height * im_width; } } else { #endif - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * col_height + h) * col_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * col_height + h) * col_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? static_cast(0) - : im_data[im_idx]; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? static_cast(0) + : im_data[im_idx]; + } } } - } -// #if defined(__ARM_NEON__) || defined(__ARM_NEON) -#if 0 +#if defined(__ARM_NEON__) || defined(__ARM_NEON) } #endif } diff --git a/src/operators/quantize_op.cpp b/src/operators/quantize_op.cpp index 7958b054de..865539d7d2 100644 --- a/src/operators/quantize_op.cpp +++ b/src/operators/quantize_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef QUANT_OP + #include "operators/quantize_op.h" #include @@ -33,3 +35,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp); #endif + +#endif diff --git a/src/operators/quantize_op.h b/src/operators/quantize_op.h index 2b0d2f8e32..ca04c1213a 100644 --- a/src/operators/quantize_op.h +++ b/src/operators/quantize_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef QUANT_OP + #pragma once #include @@ -40,3 +42,5 @@ class QuantizeOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/test/operators/test_int8_conv_op.cpp b/test/operators/test_int8_conv_op.cpp index fd9e45e9a2..2ab40ba583 100644 --- a/test/operators/test_int8_conv_op.cpp +++ b/test/operators/test_int8_conv_op.cpp @@ -140,10 +140,10 @@ int TestConvOp() { int dilation_w = 1; int batch_size = 1; - int input_c = 63; - int input_h = 51; - int input_w = 51; - int output_c = 125; + int input_c = 3; + int input_h = 100; + int input_w = 100; + int output_c = 10; framework::DDim input_shape = framework::make_ddim({batch_size, input_c, input_h, input_w}); framework::DDim filter_shape = @@ -174,40 +174,38 @@ int TestConvOp() { auto *op = new operators::ConvOp("conv2d", inputs, outputs, attrs, scope); - struct timespec ts_begin, ts_end; + // struct timespec ts_begin, ts_end; op->InferShape(); // warmup + // op->Run(); + // clock_gettime(CLOCK_MONOTONIC, &ts_begin); + // for (int i = 0; i < 10; ++i) { op->Run(); - clock_gettime(CLOCK_MONOTONIC, &ts_begin); - for (int i = 0; i < 10; ++i) { - op->Run(); - } - clock_gettime(CLOCK_MONOTONIC, &ts_end); - uint64_t elapsed = (ts_end.tv_sec - ts_begin.tv_sec) * 1e3 + - (ts_end.tv_nsec - ts_begin.tv_nsec) / 1e6; - LOG(kLOG_INFO) << "elapsed: " << elapsed / 10.0 << " ms"; + // } + // clock_gettime(CLOCK_MONOTONIC, &ts_end); + // uint64_t elapsed = (ts_end.tv_sec - ts_begin.tv_sec) * 1e3 + + // (ts_end.tv_nsec - ts_begin.tv_nsec) / 1e6; + // LOG(kLOG_INFO) << "elapsed: " << elapsed / 10.0 << " ms"; - /* - int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - int output_h = (input_h + 2 * pad_h - kernel_extent_h) / stride_h + 1; - int output_w = (input_w + 2 * pad_w - kernel_extent_w) / stride_w + 1; - auto output_shape = framework::make_ddim( - std::vector({batch_size, output_c, output_h, output_w})); - framework::Tensor output_cmp; - output_cmp.mutable_data(output_shape); - conv2d(input, filter, attrs, &output_cmp); + int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + int output_h = (input_h + 2 * pad_h - kernel_extent_h) / stride_h + 1; + int output_w = (input_w + 2 * pad_w - kernel_extent_w) / stride_w + 1; + auto output_shape = framework::make_ddim( + std::vector({batch_size, output_c, output_h, output_w})); + framework::Tensor output_cmp; + output_cmp.mutable_data(output_shape); + conv2d(input, filter, attrs, &output_cmp); - // compare results - auto output = output_var->template Get(); - const Otype *output_data = output->data(); - Otype *output_cmp_data = output_cmp.data(); - for (int i = 0; i < output->numel(); ++i) { - PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], - "output[%d] = %d, output_cmp[%d] = %d", i, - output_data[i], i, output_cmp_data[i]); - } - */ + // compare results + auto output = output_var->template Get(); + const Otype *output_data = output->data(); + Otype *output_cmp_data = output_cmp.data(); + for (int i = 0; i < output->numel(); ++i) { + PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], + "output[%d] = %d, output_cmp[%d] = %d", i, + output_data[i], i, output_cmp_data[i]); + } delete op; return 0; } @@ -219,10 +217,35 @@ int main() { LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2"; paddle_mobile::TestConvOp(); + // kernel = 7, pad = 1, stride = 2 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2"; + paddle_mobile::TestConvOp(); + // kernel = 7, pad = 3, stride = 2 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2"; paddle_mobile::TestConvOp(); + // kernel = 7, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 1, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 3, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 5, stride = 3 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 3, stride = 4 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4"; + paddle_mobile::TestConvOp(); + LOG(paddle_mobile::kLOG_INFO) << "\n"; + // kernel = 3, pad = 0, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1"; paddle_mobile::TestConvOp(); diff --git a/tools/op.cmake b/tools/op.cmake index 6e89fa4f66..bb82d134b8 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -222,6 +222,8 @@ if(NOT FOUND_MATCH) set(SHAPE_OP ON) set(ELEMENTWISEMUL_OP ON) set(SUM_OP ON) + set(QUANT_OP ON) + set(DEQUANT_OP ON) endif() # option(BATCHNORM_OP "" ON) @@ -401,3 +403,10 @@ if (SUM_OP) add_definitions(-DSUM_OP) endif() +if (QUANT_OP) + add_definitions(-DQUANT_OP) +endif() +if (DEQUANT_OP) + add_definitions(-DDEQUANT_OP) +endif() + diff --git a/tools/pre-commit.hooks/cpplint.hook b/tools/pre-commit.hooks/cpplint.hook index 26c25c2e12..78ca3cfcdd 100644 --- a/tools/pre-commit.hooks/cpplint.hook +++ b/tools/pre-commit.hooks/cpplint.hook @@ -5,7 +5,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \ grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \ - grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "variant.h"); do + grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do cpplint $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done -- GitLab