From c02023ab58d29799bd5b80cdd7164702dcffce26 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Fri, 8 Mar 2019 22:01:03 +0800
Subject: [PATCH] Revert packed lhs in sgemm and depthwise conv5x5 to fix
 problems on iOS

---
 src/framework/data_layout.h | 1 +
 src/framework/dim.h | 4 +
 .../kernel/central-arm-func/conv_arm_func.h | 20 +-
 src/operators/math/depthwise_conv5x5.cpp | 29 +-
 src/operators/math/gemm/executor.h | 9 +-
 src/operators/math/gemm/pack_kernel.h | 750 +++++++++++-------
 src/operators/math/gemm/strategy.h | 5 +-
 src/operators/math/gru_compute.cpp | 33 +-
 src/operators/math/math_function.cpp | 31 +-
 .../math/winograd/winograd_transform_f6k3.cpp | 12 +-
 tools/pre-commit.hooks/cpplint.hook | 2 +-
 11 files changed, 543 insertions(+), 353 deletions(-)

diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h
index 665b5315bc..fd0bec3913 100644
--- a/src/framework/data_layout.h
+++ b/src/framework/data_layout.h
@@ -42,6 +42,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
   } else {
     PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
   }
+  return DataLayout::kNCHW;
 }
 
 inline std::string DataLayoutToString(const DataLayout &data_layout) {
diff --git a/src/framework/dim.h b/src/framework/dim.h
index 7c78659e3b..e11d6fe39a 100644
--- a/src/framework/dim.h
+++ b/src/framework/dim.h
@@ -82,6 +82,8 @@ struct Dim<0> {
   int64_t &operator[](int idx);
   int64_t operator[](int idx) const;
+
+  int64_t head;
 };
 
 namespace {
@@ -131,6 +133,7 @@ int64_t &indexer(Dim &dim, int idx) {
 template <>
 int64_t &indexer<0>(Dim<0> &dim, int idx) {
   PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
+  return dim.head;
 }
 
 template
@@ -147,6 +150,7 @@ int64_t indexer(const Dim &dim, int idx) {
 template <>
 int64_t indexer<0>(const Dim<0> &dim, int idx) {
   PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
+  return dim.head;
 }
 } // namespace
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index bf7e3b0abc..b527c0bad9 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -201,16 +201,16 @@ inline void DepthwiseConv5x5(const ConvParam &param) {
   Tensor *output = param.Output();
   output->mutable_data();
-  if (strides[0] == 1) {
-    for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice(i, i + 1);
-      Tensor out_batch = output->Slice(i, i + 1);
-      math::DepthwiseConv5x5S1(in_batch, *filter, paddings,
-                               &out_batch);
-    }
-  } else {
-    GemmConv(param);
-  }
+  // if (strides[0] == 1) {
+  //   for (int i = 0; i < batch_size; i++) {
+  //     Tensor in_batch = input->Slice(i, i + 1);
+  //     Tensor out_batch = output->Slice(i, i + 1);
+  //     math::DepthwiseConv5x5S1(in_batch, *filter, paddings,
+  //     &out_batch);
+  //   }
+  // } else {
+  GemmConv(param);
+  // }
 }
 
 template
diff --git a/src/operators/math/depthwise_conv5x5.cpp b/src/operators/math/depthwise_conv5x5.cpp
index 306ebb85ff..99ddfc9249 100644
--- a/src/operators/math/depthwise_conv5x5.cpp
+++ b/src/operators/math/depthwise_conv5x5.cpp
@@ -144,20 +144,21 @@ void DepthwiseConv5x5S1(const framework::Tensor &input,
   const float *input_data = input.data();
   const float *filter_data = filter.data();
   float *out_data = output->mutable_data();
-  int input_h = input.dims()[2];
-  int input_w = input.dims()[3];
-  int output_h = output->dims()[2];
-  int output_w = output->dims()[3];
-  int padding_h = paddings[0];
-  int padding_w = paddings[1];
-  int image_size = input_h * input_w;
-  int out_image_size = 
output_h * output_w; - int valid_h_start = padding_h; - int valid_h_end = output_h - valid_h_start; - int valid_h = valid_h_end - valid_h_start; - int valid_w_start = padding_w; - int valid_w_end = output_w - valid_w_start; - int valid_w = valid_w_end - valid_w_start; + + const int input_h = input.dims()[2]; + const int input_w = input.dims()[3]; + const int output_h = output->dims()[2]; + const int output_w = output->dims()[3]; + const int padding_h = paddings[0]; + const int padding_w = paddings[1]; + const int image_size = input_h * input_w; + const int out_image_size = output_h * output_w; + const int valid_h_start = padding_h; + const int valid_h_end = output_h - valid_h_start; + const int valid_h = valid_h_end - valid_h_start; + const int valid_w_start = padding_w; + const int valid_w_end = output_w - valid_w_start; + const int valid_w = valid_w_end - valid_w_start; #pragma omp parallel for for (int g = 0; g < input.dims()[1]; ++g) { diff --git a/src/operators/math/gemm/executor.h b/src/operators/math/gemm/executor.h index 93bf90c8b7..c629471c7c 100644 --- a/src/operators/math/gemm/executor.h +++ b/src/operators/math/gemm/executor.h @@ -18,7 +18,8 @@ limitations under the License. */ #ifdef _OPENMP #include #endif -#include +// #include +// #include #include "common/log.h" #include "memory/t_malloc.h" #include "operators/math/gemm/cpu_info.h" @@ -158,7 +159,8 @@ class GemmExecutor : public Executor { } } } - strategy_.write(lhs_range, N_, local_C, ldc_, C + lhs_block * ldc, ldc); + strategy_.write(lhs_range, N_, alpha, local_C, ldc_, beta, + C + lhs_block * ldc, ldc); } } else { strategy_.pack_lhs(M_, K_, A, lda, lhs_workspace_, true); @@ -188,7 +190,8 @@ class GemmExecutor : public Executor { } } } - strategy_.write(M_, rhs_range, local_C, ldc_, C + rhs_block, ldc); + strategy_.write(M_, rhs_range, alpha, local_C, ldc_, beta, + C + rhs_block, ldc); } } diff --git a/src/operators/math/gemm/pack_kernel.h b/src/operators/math/gemm/pack_kernel.h index 4018160510..31342ec1b7 100644 --- a/src/operators/math/gemm/pack_kernel.h +++ b/src/operators/math/gemm/pack_kernel.h @@ -31,275 +31,345 @@ inline float32x4_t vandq_f32_u32(float32x4_t x, uint32x4_t mask) { void pack_lhs_6r(const int m, const int k, const float *A, const int lda, float *output, const bool unroll) { - uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5}; - int remain_k = k & 0x3; - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k)); + float *zero = new float[k]; + memset(zero, 0, k * sizeof(float)); - #pragma omp parallel for if (unroll) - for (int i = 0; i < m - 5; i += 6) { + const int m_tail = m % 6; + const int i_length = m - m_tail; + for (int i = 0; i < i_length; i += 6) { const float *a0 = A + i * lda; const float *a1 = A + (i + 1) * lda; const float *a2 = A + (i + 2) * lda; const float *a3 = A + (i + 3) * lda; const float *a4 = A + (i + 4) * lda; const float *a5 = A + (i + 5) * lda; - float *out_ptr = output + i * k; - - int loops = k >> 2; - if (loops > 0) { -#if __aarch64__ - for (int l = 0; l < loops; ++l) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), 
vget_low_f32(_q1.val[1])); - _d2 = - vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - _d3 = - vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1])); - - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); - vst1q_f32(out_ptr + 18, _d3); - vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1])); - - a0 += 4; - a1 += 4; - a2 += 4; - a3 += 4; - a4 += 4; - a5 += 4; - out_ptr += 24; - } -#else - asm volatile( - "loop_4k_%=: \n" - "vld1.32 {d0-d1}, [%[a0]]! \n" - "vld1.32 {d2-d3}, [%[a1]]! \n" - "vld1.32 {d4-d5}, [%[a2]]! \n" - "vld1.32 {d6-d7}, [%[a3]]! \n" - "vld1.32 {d8-d9}, [%[a4]]! \n" - "vld1.32 {d10-d11}, [%[a5]]! \n" - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - "vtrn.32 q4, q5 \n" - "vswp.32 d1, d4 \n" - "vswp.32 d3, d6 \n" - - "vst1.32 {q0}, [%[out]]! \n" - "vst1.32 {d8}, [%[out]]! \n" - "vst1.32 {q1}, [%[out]]! \n" - "vst1.32 {d10}, [%[out]]! \n" - "vst1.32 {q2}, [%[out]]! \n" - "vst1.32 {d9}, [%[out]]! \n" - "vst1.32 {q3}, [%[out]]! \n" - "vst1.32 {d11}, [%[out]]! \n" - - "subs %[loops], #1 \n" - "bne loop_4k_%= \n" - : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2), - [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -#endif - } - - if (remain_k > 0) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - _d0 = vandq_f32_u32(_d0, vmask1); - _d1 = vandq_f32_u32(_d1, vmask1); - _d2 = vandq_f32_u32(_d2, vmask1); - _d3 = vandq_f32_u32(_d3, vmask1); - _d4 = vandq_f32_u32(_d4, vmask1); - _d5 = vandq_f32_u32(_d5, vmask1); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - - switch (remain_k) { - case 3: - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); - case 2: - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); - case 1: - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); - default: - break; - } + float *local_buffer = output + i * k; + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; } } - - int remain_m = m % 6; - if (remain_m) { - int remain_m_start = m - remain_m; - const float *a0 = A + remain_m_start * lda; + if (m_tail != 0) { + const float *a0 = A + i_length * lda; const float *a1 = a0 + lda; const float *a2 = a0 + 2 * lda; const float *a3 = a0 + 3 * lda; const float *a4 = a0 + 4 * lda; const float *a5 = a0 + 5 * lda; - float *out_ptr = output + remain_m_start * k; - - uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m)); - uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_m)); - - int loops = k >> 2; - if (loops > 0) { -#if __aarch64__ - for (int l = 0; l < loops; ++l) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - 
float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = - vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - _d3 = - vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1])); - - _d0 = vandq_f32_u32(_d0, vmask2); - _d1 = vandq_f32_u32(_d1, vmask2); - _d2 = vandq_f32_u32(_d2, vmask2); - _d3 = vandq_f32_u32(_d3, vmask2); - _d4 = vandq_f32_u32(_q3.val[0], vmask3); - _d5 = vandq_f32_u32(_q3.val[1], vmask3); - - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_d4)); - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_d5)); - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_d4)); - vst1q_f32(out_ptr + 18, _d3); - vst1_f32(out_ptr + 22, vget_high_f32(_d5)); - - a0 += 4; - a1 += 4; - a2 += 4; - a3 += 4; - a4 += 4; - a5 += 4; - out_ptr += 24; - } -#else - asm volatile( - "loop_4k_%=: \n" - "vld1.32 {d0-d1}, [%[a0]]! \n" - "vld1.32 {d2-d3}, [%[a1]]! \n" - "vld1.32 {d4-d5}, [%[a2]]! \n" - "vld1.32 {d6-d7}, [%[a3]]! \n" - "vld1.32 {d8-d9}, [%[a4]]! \n" - "vld1.32 {d10-d11}, [%[a5]]! \n" - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - "vtrn.32 q4, q5 \n" - "vswp.32 d1, d4 \n" - "vswp.32 d3, d6 \n" - - "vbif q0, %q[vzero], %q[vmask2] \n" - "vbif q1, %q[vzero], %q[vmask2] \n" - "vbif q2, %q[vzero], %q[vmask2] \n" - "vbif q3, %q[vzero], %q[vmask2] \n" - "vbif q4, %q[vzero], %q[vmask3] \n" - "vbif q5, %q[vzero], %q[vmask3] \n" - - "vst1.32 {q0}, [%[out]]! \n" - "vst1.32 {d8}, [%[out]]! \n" - "vst1.32 {q1}, [%[out]]! \n" - "vst1.32 {d10}, [%[out]]! \n" - "vst1.32 {q2}, [%[out]]! \n" - "vst1.32 {d9}, [%[out]]! \n" - "vst1.32 {q3}, [%[out]]! \n" - "vst1.32 {d11}, [%[out]]! 
\n" - - "subs %[loops], #1 \n" - "bne loop_4k_%= \n" - : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2), - [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) - : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -#endif + float *local_buffer = output + i_length * k; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + break; + default: + break; } - - if (remain_k > 0) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - _d0 = vandq_f32_u32(_d0, vmask1); - _d1 = vandq_f32_u32(_d1, vmask1); - _d2 = vandq_f32_u32(_d2, vmask1); - _d3 = vandq_f32_u32(_d3, vmask1); - _d4 = vandq_f32_u32(_d4, vmask1); - _d5 = vandq_f32_u32(_d5, vmask1); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - // _d3 = vcombine_f32(vget_high_f32(_q0.val[1]), - // vget_high_f32(_q1.val[1])); - - _d0 = vandq_f32_u32(_d0, vmask2); - _d1 = vandq_f32_u32(_d1, vmask2); - _d2 = vandq_f32_u32(_d2, vmask2); - // _d3 = vandq_f32_u32(_d3, vmask2); - _d4 = vandq_f32_u32(_q3.val[0], vmask3); - _d5 = vandq_f32_u32(_q3.val[1], vmask3); - - switch (remain_k) { - case 3: - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_d4)); - case 2: - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_d5)); - case 1: - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_d4)); - default: - break; - } + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; } + delete[] zero; } + + // uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5}; + // int remain_k = k & 0x3; + // uint32x4_t vzero = vdupq_n_u32(0); + // uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k)); + // + // std::cout << "m: " << m << ", k: " << k << std::endl; + // #pragma omp parallel for if (unroll) + // for (int i = 0; i < m - 5; i += 6) { + // std::cout << "i: " << i << std::endl; + // const float *a0 = A + i * lda; + // const float *a1 = A + (i + 1) * lda; + // const float *a2 = A + (i + 2) * lda; + // const float *a3 = A + (i + 3) * lda; + // const float *a4 = A + (i + 4) * lda; + // const float *a5 = A + (i + 5) * lda; + // float *out_ptr = output + i * k; + // + // int loops = k >> 2; + // if (loops > 0) { + // #if __aarch64__ + // for (int l = 0; l < loops; ++l) { + // float32x4_t _d0 = vld1q_f32(a0); + // float32x4_t _d1 = vld1q_f32(a1); + // float32x4_t _d2 = vld1q_f32(a2); + // float32x4_t _d3 = vld1q_f32(a3); + // float32x4_t _d4 = vld1q_f32(a4); + // float32x4_t _d5 = vld1q_f32(a5); + // + // float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); + // float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); + // float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); + // _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), + // vget_low_f32(_q1.val[0])); _d1 = + // vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); + // _d2 = + // vcombine_f32(vget_high_f32(_q0.val[0]), + // 
vget_high_f32(_q1.val[0])); + // _d3 = + // vcombine_f32(vget_high_f32(_q0.val[1]), + // vget_high_f32(_q1.val[1])); + // + // vst1q_f32(out_ptr, _d0); + // vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); + // vst1q_f32(out_ptr + 6, _d1); + // vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); + // vst1q_f32(out_ptr + 12, _d2); + // vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); + // vst1q_f32(out_ptr + 18, _d3); + // vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1])); + // + // a0 += 4; + // a1 += 4; + // a2 += 4; + // a3 += 4; + // a4 += 4; + // a5 += 4; + // out_ptr += 24; + // } + // #else + // asm volatile( + // "loop_4k_%=: \n" + // "vld1.32 {d0-d1}, [%[a0]]! \n" + // "vld1.32 {d2-d3}, [%[a1]]! \n" + // "vld1.32 {d4-d5}, [%[a2]]! \n" + // "vld1.32 {d6-d7}, [%[a3]]! \n" + // "vld1.32 {d8-d9}, [%[a4]]! \n" + // "vld1.32 {d10-d11}, [%[a5]]! \n" + // "vtrn.32 q0, q1 \n" + // "vtrn.32 q2, q3 \n" + // "vtrn.32 q4, q5 \n" + // "vswp.32 d1, d4 \n" + // "vswp.32 d3, d6 \n" + // + // "vst1.32 {q0}, [%[out]]! \n" + // "vst1.32 {d8}, [%[out]]! \n" + // "vst1.32 {q1}, [%[out]]! \n" + // "vst1.32 {d10}, [%[out]]! \n" + // "vst1.32 {q2}, [%[out]]! \n" + // "vst1.32 {d9}, [%[out]]! \n" + // "vst1.32 {q3}, [%[out]]! \n" + // "vst1.32 {d11}, [%[out]]! \n" + // + // "subs %[loops], #1 \n" + // "bne loop_4k_%= \n" + // : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] + // "+r"(a2), + // [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) + // : + // : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); + // #endif + // } + // + // if (remain_k > 0) { + // float32x4_t _d0 = vld1q_f32(a0); + // float32x4_t _d1 = vld1q_f32(a1); + // float32x4_t _d2 = vld1q_f32(a2); + // float32x4_t _d3 = vld1q_f32(a3); + // float32x4_t _d4 = vld1q_f32(a4); + // float32x4_t _d5 = vld1q_f32(a5); + // + // _d0 = vandq_f32_u32(_d0, vmask1); + // _d1 = vandq_f32_u32(_d1, vmask1); + // _d2 = vandq_f32_u32(_d2, vmask1); + // _d3 = vandq_f32_u32(_d3, vmask1); + // _d4 = vandq_f32_u32(_d4, vmask1); + // _d5 = vandq_f32_u32(_d5, vmask1); + // + // float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); + // float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); + // float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); + // _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), + // vget_low_f32(_q1.val[0])); _d1 = + // vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2 + // = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); + // + // switch (remain_k) { + // case 3: + // vst1q_f32(out_ptr + 12, _d2); + // vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); + // case 2: + // vst1q_f32(out_ptr + 6, _d1); + // vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); + // case 1: + // vst1q_f32(out_ptr, _d0); + // vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); + // default: + // break; + // } + // } + // } + // + // int remain_m = m % 6; + // if (remain_m) { + // int remain_m_start = m - remain_m; + // std::cout << "remain_m_start: " << remain_m_start << std::endl; + // const float *a0 = A + remain_m_start * lda; + // const float *a1 = a0 + lda; + // const float *a2 = a0 + 2 * lda; + // const float *a3 = a0 + 3 * lda; + // const float *a4 = a0 + 4 * lda; + // const float *a5 = a0 + 5 * lda; + // float *out_ptr = output + remain_m_start * k; + // + // uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m)); + // uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), + // vdupq_n_u32(remain_m)); + // + // int loops = k >> 2; + // if (loops > 0) { + // #if __aarch64__ + // for (int l = 0; l < loops; ++l) { + // float32x4_t 
_d0 = vld1q_f32(a0); + // float32x4_t _d1 = vld1q_f32(a1); + // float32x4_t _d2 = vld1q_f32(a2); + // float32x4_t _d3 = vld1q_f32(a3); + // float32x4_t _d4 = vld1q_f32(a4); + // float32x4_t _d5 = vld1q_f32(a5); + // + // float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); + // float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); + // float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); + // _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), + // vget_low_f32(_q1.val[0])); _d1 = + // vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); + // _d2 = + // vcombine_f32(vget_high_f32(_q0.val[0]), + // vget_high_f32(_q1.val[0])); + // _d3 = + // vcombine_f32(vget_high_f32(_q0.val[1]), + // vget_high_f32(_q1.val[1])); + // + // _d0 = vandq_f32_u32(_d0, vmask2); + // _d1 = vandq_f32_u32(_d1, vmask2); + // _d2 = vandq_f32_u32(_d2, vmask2); + // _d3 = vandq_f32_u32(_d3, vmask2); + // _d4 = vandq_f32_u32(_q3.val[0], vmask3); + // _d5 = vandq_f32_u32(_q3.val[1], vmask3); + // + // vst1q_f32(out_ptr, _d0); + // vst1_f32(out_ptr + 4, vget_low_f32(_d4)); + // vst1q_f32(out_ptr + 6, _d1); + // vst1_f32(out_ptr + 10, vget_low_f32(_d5)); + // vst1q_f32(out_ptr + 12, _d2); + // vst1_f32(out_ptr + 16, vget_high_f32(_d4)); + // vst1q_f32(out_ptr + 18, _d3); + // vst1_f32(out_ptr + 22, vget_high_f32(_d5)); + // + // a0 += 4; + // a1 += 4; + // a2 += 4; + // a3 += 4; + // a4 += 4; + // a5 += 4; + // out_ptr += 24; + // } + // #else + // asm volatile( + // "loop_4k_%=: \n" + // "vld1.32 {d0-d1}, [%[a0]]! \n" + // "vld1.32 {d2-d3}, [%[a1]]! \n" + // "vld1.32 {d4-d5}, [%[a2]]! \n" + // "vld1.32 {d6-d7}, [%[a3]]! \n" + // "vld1.32 {d8-d9}, [%[a4]]! \n" + // "vld1.32 {d10-d11}, [%[a5]]! \n" + // "vtrn.32 q0, q1 \n" + // "vtrn.32 q2, q3 \n" + // "vtrn.32 q4, q5 \n" + // "vswp.32 d1, d4 \n" + // "vswp.32 d3, d6 \n" + // + // "vbif q0, %q[vzero], %q[vmask2] \n" + // "vbif q1, %q[vzero], %q[vmask2] \n" + // "vbif q2, %q[vzero], %q[vmask2] \n" + // "vbif q3, %q[vzero], %q[vmask2] \n" + // "vbif q4, %q[vzero], %q[vmask3] \n" + // "vbif q5, %q[vzero], %q[vmask3] \n" + // + // "vst1.32 {q0}, [%[out]]! \n" + // "vst1.32 {d8}, [%[out]]! \n" + // "vst1.32 {q1}, [%[out]]! \n" + // "vst1.32 {d10}, [%[out]]! \n" + // "vst1.32 {q2}, [%[out]]! \n" + // "vst1.32 {d9}, [%[out]]! \n" + // "vst1.32 {q3}, [%[out]]! \n" + // "vst1.32 {d11}, [%[out]]! 
\n" + // + // "subs %[loops], #1 \n" + // "bne loop_4k_%= \n" + // : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] + // "+r"(a2), + // [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) + // : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero) + // : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); + // #endif + // } + // + // if (remain_k > 0) { + // float32x4_t _d0 = vld1q_f32(a0); + // float32x4_t _d1 = vld1q_f32(a1); + // float32x4_t _d2 = vld1q_f32(a2); + // float32x4_t _d3 = vld1q_f32(a3); + // float32x4_t _d4 = vld1q_f32(a4); + // float32x4_t _d5 = vld1q_f32(a5); + // + // _d0 = vandq_f32_u32(_d0, vmask1); + // _d1 = vandq_f32_u32(_d1, vmask1); + // _d2 = vandq_f32_u32(_d2, vmask1); + // _d3 = vandq_f32_u32(_d3, vmask1); + // _d4 = vandq_f32_u32(_d4, vmask1); + // _d5 = vandq_f32_u32(_d5, vmask1); + // + // float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); + // float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); + // float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); + // _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), + // vget_low_f32(_q1.val[0])); _d1 = + // vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2 + // = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); + // // _d3 = vcombine_f32(vget_high_f32(_q0.val[1]), + // // vget_high_f32(_q1.val[1])); + // + // _d0 = vandq_f32_u32(_d0, vmask2); + // _d1 = vandq_f32_u32(_d1, vmask2); + // _d2 = vandq_f32_u32(_d2, vmask2); + // // _d3 = vandq_f32_u32(_d3, vmask2); + // _d4 = vandq_f32_u32(_q3.val[0], vmask3); + // _d5 = vandq_f32_u32(_q3.val[1], vmask3); + // + // switch (remain_k) { + // case 3: + // vst1q_f32(out_ptr + 12, _d2); + // vst1_f32(out_ptr + 16, vget_high_f32(_d4)); + // case 2: + // vst1q_f32(out_ptr + 6, _d1); + // vst1_f32(out_ptr + 10, vget_low_f32(_d5)); + // case 1: + // vst1q_f32(out_ptr, _d0); + // vst1_f32(out_ptr + 4, vget_low_f32(_d4)); + // default: + // break; + // } + // } + // } } #if __aarch64__ @@ -575,9 +645,49 @@ void pack_rhs_8c(int k, int n, const float *B, int ldb, float *output, } #endif // __aarch64__ +void write_back_alpha_beta(const int mc, const int nc, const float alpha, + const float *c, const int ldc1, const float beta, + float *C, const int ldc2) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float32x4_t _alpha = vdupq_n_f32(alpha); + float32x4_t _beta = vdupq_n_f32(beta); + float32x4_t cv, cv2; + for (int i = 0; i < mc; ++i) { + const float *c_ptr = c + i * ldc1; + float *C_ptr = C + i * ldc2; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmulq_f32(_alpha, cv); + cv2 = vld1q_f32(C_ptr); + cv = vmlaq_f32(cv, _beta, cv2); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmulq_f32(_alpha, cv); + cv2 = vld1q_f32(C_ptr); + cv = vmlaq_f32(cv, _beta, cv2); + switch (_nc1) { + case 3: + vst1q_lane_f32(C_ptr + 2, cv, 2); + case 2: + vst1_f32(C_ptr, vget_low_f32(cv)); + break; + case 1: + vst1q_lane_f32(C_ptr, cv, 0); + break; + } + } + } +} + #if __aarch64__ -void write_back(const int mc, const int nc, const float *c, const int ldc1, - float *C, const int ldc2) { +void write_back_alpha1_beta0(const int mc, const int nc, const float *c, + const int ldc1, float *C, const int ldc2) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -595,23 +705,60 @@ void write_back(const int mc, const int nc, const float *c, const int ldc1, } if (_nc1 != 0) { cv = vld1q_f32(c_ptr); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - 
C_ptr++; + switch (_nc1) { + case 3: + vst1q_lane_f32(C_ptr + 2, cv, 2); + case 2: + vst1_f32(C_ptr, vget_low_f32(cv)); + break; + case 1: + vst1q_lane_f32(C_ptr, cv, 0); + break; } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); + } + } +} + +void write_back_alpha1_beta1(const int mc, const int nc, const float *c, + const int ldc1, float *C, const int ldc2) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + const float *c_ptr; + float *C_ptr; + float32x4_t cv, cv2; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * ldc1; + C_ptr = C + i * ldc2; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv2 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv2); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv2 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv2); + switch (_nc1) { + case 3: + vst1q_lane_f32(C_ptr + 2, cv, 2); + case 2: + vst1_f32(C_ptr, vget_low_f32(cv)); + break; + case 1: + vst1q_lane_f32(C_ptr, cv, 0); + break; } } } } + #else -void write_back(const int mc, const int nc, const float *c, const int ldc1, - float *C, const int ldc2) { +void write_back_alpha1_beta0(const int mc, const int nc, const float *c, + const int ldc1, float *C, const int ldc2) { int nc1 = nc / 16; int nc2 = nc % 16; int step1 = 4 * (ldc1 - 16 * nc1); @@ -663,7 +810,78 @@ void write_back(const int mc, const int nc, const float *c, const int ldc1, } } } -#endif + +void write_back_alpha1_beta1(const int mc, const int nc, const float *c, + const int ldc1, float *C, const int ldc2) { + int nc1 = nc / 16; + int nc2 = nc % 16; + int step1 = 4 * (ldc1 - 16 * nc1); + int step2 = 4 * ldc2; + int volatile m = mc; + + const float *volatile c_ptr = c; + float *volatile C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" + "vld1.32 {q2, q3}, [r6] \n\t" + "vadd.f32 q0, q0, q2 \n\t" + "vadd.f32 q1, q1, q3 \n\t" + "vst1.32 {q0, q1}, [r6]! \n\t" + + "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" + "vld1.32 {q2, q3}, [r6] \n\t" + "vadd.f32 q0, q0, q2 \n\t" + "vadd.f32 q1, q1, q3 \n\t" + "vst1.32 {q0, q1}, [r6]! 
\n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "add %[C_ptr], %[C_ptr], %[step2] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step1] "r"(step1), [step2] "r"(step2) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); + } + + if (nc2 != 0) { + for (int i = 0; i < mc; i++) { + const float *c0 = c_ptr + nc1 * 16 + i * ldc1; + float *C0 = C_ptr + nc1 * 16 + i * ldc2; + for (int j = 0; j < nc2; j++) { + *C0++ += *c0++; + } + } + } +} +#endif // __aarch64__ + +void write_back(const int mc, const int nc, const float alpha, const float *c, + const int ldc1, const float beta, float *C, const int ldc2) { + if (alpha == 1.f && beta == 0.f) { + write_back_alpha1_beta0(mc, nc, c, ldc1, C, ldc2); + } else if (alpha == 1.f && beta == 1.f) { + write_back_alpha1_beta1(mc, nc, c, ldc1, C, ldc2); + } else { + write_back_alpha_beta(mc, nc, alpha, c, ldc1, beta, C, ldc2); + } +} } // namespace math } // namespace operators diff --git a/src/operators/math/gemm/strategy.h b/src/operators/math/gemm/strategy.h index 9277146513..51417a3b4b 100644 --- a/src/operators/math/gemm/strategy.h +++ b/src/operators/math/gemm/strategy.h @@ -31,8 +31,9 @@ struct SgemmStrategy { Itype *, const bool); typedef void (*kernelFunc)(const Itype *, const Itype *, const int, Otype *, const int); - typedef void (*WriteFunc)(const int, const int, const Otype *, const int, - Otype *, const int); + typedef void (*WriteFunc)(const int, const int, const float alpha, + const Otype *, const int, const float beta, Otype *, + const int); packLhsFunc pack_lhs; packRhsFunc pack_rhs; diff --git a/src/operators/math/gru_compute.cpp b/src/operators/math/gru_compute.cpp index 19c7a2685c..d30ea5aa47 100644 --- a/src/operators/math/gru_compute.cpp +++ b/src/operators/math/gru_compute.cpp @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "operators/math/gru_compute.h" #include "common/types.h" #include "operators/math/activation.h" -#include "operators/math/gemm.h" +#include "operators/math/gemm/cblas.h" #include "operators/math/gru_cpu_kernel.h" namespace paddle_mobile { @@ -29,35 +29,19 @@ struct GRUUnitFunctor { static void compute(GRUMetaValue value, int frame_size, int batch_size, const ActivationType active_node, const ActivationType active_gate) { - Gemm gemm; if (value.prev_out_value) { -#ifdef _OPENMP - gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1, - value.prev_out_value, frame_size, value.gate_weight, - frame_size * 2, 1, value.gate_value, frame_size * 3, false, - static_cast(nullptr)); -#else - gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1, - value.prev_out_value, frame_size, value.gate_weight, - frame_size * 2, 1, value.gate_value, frame_size * 3, false, - static_cast(nullptr)); -#endif + cblas_sgemm(false, false, batch_size, frame_size * 2, frame_size, 1.f, + value.prev_out_value, frame_size, value.gate_weight, + frame_size * 2, 1.f, value.gate_value, frame_size * 3); } forward_reset_output(value, frame_size, batch_size, active_gate); if (value.prev_out_value) { -#ifdef _OPENMP - gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1, - value.reset_output_value, frame_size, value.state_weight, - frame_size, 1, value.gate_value + frame_size * 2, - frame_size * 3, false, static_cast(nullptr)); -#else - gemm.Sgemm(batch_size, frame_size, frame_size, 1, - value.reset_output_value, frame_size, value.state_weight, - frame_size, 1, value.gate_value + frame_size * 2, - frame_size * 3, false, static_cast(nullptr)); -#endif + cblas_sgemm(false, false, batch_size, frame_size, frame_size, 1.f, + value.reset_output_value, frame_size, value.state_weight, + frame_size, 1.f, value.gate_value + frame_size * 2, + frame_size * 3); } forward_final_output(value, frame_size, batch_size, active_node); @@ -65,6 +49,7 @@ struct GRUUnitFunctor { }; template struct GRUUnitFunctor; + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index 6cd0514832..4a35dd2a57 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -71,34 +71,11 @@ void MatMul(const framework::Tensor &matrix_a, bool trans_a, a[index++] = tmp[i * n + j]; } } - if (M == 1) { -#ifdef _OPENMP - gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); -#else - gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); -#endif - } else { - cblas_sgemm(false, false, M, N, K, alpha, a, K, matrix_b.data(), N, - beta, matrix_out->data(), N); - } + cblas_sgemm(false, false, M, N, K, alpha, a, K, matrix_b.data(), N, + beta, matrix_out->data(), N); } else { - if (M == 1) { -#ifdef _OPENMP - gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), - N, relu, bias); -#else - gemm.Sgemm(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), N, - relu, bias); -#endif - } else { - cblas_sgemm(false, false, M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), - N); - } + cblas_sgemm(false, false, M, N, K, alpha, matrix_a.data(), K, + matrix_b.data(), N, beta, matrix_out->data(), N); } } diff --git a/src/operators/math/winograd/winograd_transform_f6k3.cpp b/src/operators/math/winograd/winograd_transform_f6k3.cpp index 
7972196656..234de599ad 100644 --- a/src/operators/math/winograd/winograd_transform_f6k3.cpp +++ b/src/operators/math/winograd/winograd_transform_f6k3.cpp @@ -803,9 +803,9 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, "dup v15.4s, wzr \n" "cmp %[inter], #0 \n" - "ble loop_1c_%= \n" + "ble 2f \n" // loop 2 channels - "loop_2c_%=: \n" + "1: \n" "ld1 {v0.4s, v1.4s}, [%[w_ptr]], #32 \n" "ld1 {v2.4s, v3.4s}, [%[in_ptr]], #32 \n" "ld1 {v4.4s, v5.4s}, [%[in_ptr]], #32 \n" @@ -829,12 +829,12 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, "fmla v15.4s, v5.4s, v1.s[3] \n" "subs %[inter], %[inter], #1 \n" - "bne loop_2c_%= \n" + "bne 1b \n" // loop 1 channel - "loop_1c_%=: \n" + "2: \n" "cmp %[remain], #0 \n" - "ble store_res_%= \n" + "ble 3f \n" "ld1 {v0.4s, v1.4s}, [%[w_ptr]], #32 \n" "ld1 {v2.4s, v3.4s}, [%[in_ptr]], #32 \n" @@ -847,7 +847,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, "fmla v14.4s, v2.4s, v0.s[3] \n" "fmla v15.4s, v3.4s, v0.s[3] \n" - "store_res_%=: \n" + "3: \n" "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[uv_ptr]], #64 \n" "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[uv_ptr]], #64 \n" : [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr), diff --git a/tools/pre-commit.hooks/cpplint.hook b/tools/pre-commit.hooks/cpplint.hook index 78ca3cfcdd..7f4db9fbfb 100644 --- a/tools/pre-commit.hooks/cpplint.hook +++ b/tools/pre-commit.hooks/cpplint.hook @@ -5,7 +5,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \ grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \ - grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do + grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "dim.h"); do cpplint $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done -- GitLab
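
The core of this revert is pack_lhs_6r: the NEON/inline-assembly packing (kept above as comments) is replaced by a plain scalar loop, which sidesteps the assembler problems seen on iOS. Below is a minimal scalar sketch of the layout that loop produces; pack_lhs_6r_reference is a hypothetical name, not part of the patch, and it uses std::vector in place of the patch's new[]/delete[] zero buffer.

#include <vector>

// Minimal scalar sketch of the 6-row lhs packing layout. For each block of
// six rows, columns are written row-interleaved: a0[j], a1[j], ..., a5[j]
// for j = 0..k-1. Rows beyond m read from a zero row, which is the role of
// the patch's heap-allocated `zero` buffer.
void pack_lhs_6r_reference(int m, int k, const float *A, int lda,
                           float *output) {
  std::vector<float> zero(k, 0.f);  // stands in for rows past the end of A
  for (int i = 0; i < m; i += 6) {
    for (int j = 0; j < k; ++j) {
      for (int r = 0; r < 6; ++r) {
        const float *row = (i + r < m) ? A + (i + r) * lda : zero.data();
        *output++ = row[j];
      }
    }
  }
}

Each group of six rows is emitted column-interleaved so the 6xK micro-kernel can stream the packed buffer sequentially; short tail blocks read from the zero row, which keeps the kernel itself free of bounds checks.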
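
The write-back path now threads alpha and beta through SgemmStrategy::WriteFunc, so one strategy can serve both the plain-copy case and the accumulate case. A scalar sketch of the shared semantics, assuming row-major c (mc x nc, leading dimension ldc1) and C (leading dimension ldc2); write_back_reference is a hypothetical name, not part of the patch.

// Scalar reference for the new write-back semantics: C = alpha * c + beta * C.
void write_back_reference(int mc, int nc, float alpha, const float *c,
                          int ldc1, float beta, float *C, int ldc2) {
  for (int i = 0; i < mc; ++i) {
    for (int j = 0; j < nc; ++j) {
      C[i * ldc2 + j] = alpha * c[i * ldc1 + j] + beta * C[i * ldc2 + j];
    }
  }
}

write_back() above dispatches the two common cases, alpha == 1/beta == 0 (overwrite) and alpha == 1/beta == 1 (accumulate), to the specialized NEON kernels and falls back to write_back_alpha_beta otherwise.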
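
With alpha/beta in place, GRUUnitFunctor drops the Gemm/Sgemm_omp pair in favor of cblas_sgemm with beta = 1, accumulating the previous-state projection into the gate buffer. As a hypothetical scalar reference of what the first call computes (gru_gate_projection_reference is an illustrative name; row-major layout assumed, matching these kernels):

// Scalar reference for the first cblas_sgemm call in GRUUnitFunctor::compute:
// gate_value[:, 0..2*frame) += prev_out * gate_weight, i.e. M = batch,
// N = 2 * frame, K = frame, alpha = 1, beta = 1, no transposes.
void gru_gate_projection_reference(int batch, int frame, const float *prev_out,
                                   const float *gate_weight,
                                   float *gate_value) {
  const int n = 2 * frame;    // output width: update and reset gates
  const int ldc = 3 * frame;  // each gate_value row holds three gates
  for (int b = 0; b < batch; ++b) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int k0 = 0; k0 < frame; ++k0) {
        acc += prev_out[b * frame + k0] * gate_weight[k0 * n + j];
      }
      gate_value[b * ldc + j] += acc;  // beta == 1: accumulate into the gates
    }
  }
}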