Commit 73a7f138 authored by hjchen2

Revert lhs packing in sgemm and the depthwise conv5x5 fast path to avoid problems on iOS

Parent 269a3a37
......@@ -42,6 +42,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
} else {
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
}
return DataLayout::kNCHW;
}
inline std::string DataLayoutToString(const DataLayout &data_layout) {
......
......@@ -82,6 +82,8 @@ struct Dim<0> {
int64_t &operator[](int idx);
int64_t operator[](int idx) const;
int64_t head;
};
namespace {
......@@ -131,6 +133,7 @@ int64_t &indexer(Dim<D> &dim, int idx) {
template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
return dim.head;
}
template <int D>
......@@ -147,6 +150,7 @@ int64_t indexer(const Dim<D> &dim, int idx) {
template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
return dim.head;
}
} // namespace
......
......@@ -201,16 +201,16 @@ inline void DepthwiseConv5x5(const ConvParam<CPU> &param) {
Tensor *output = param.Output();
output->mutable_data<Otype>();
if (strides[0] == 1) {
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1);
math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
&out_batch);
}
} else {
GemmConv<Itype, Otype>(param);
}
// if (strides[0] == 1) {
// for (int i = 0; i < batch_size; i++) {
// Tensor in_batch = input->Slice(i, i + 1);
// Tensor out_batch = output->Slice(i, i + 1);
// math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
// &out_batch);
// }
// } else {
GemmConv<Itype, Otype>(param);
// }
}
template <typename ParamType>
......
......@@ -144,20 +144,21 @@ void DepthwiseConv5x5S1<float, float>(const framework::Tensor &input,
const float *input_data = input.data<float>();
const float *filter_data = filter.data<float>();
float *out_data = output->mutable_data<float>();
int input_h = input.dims()[2];
int input_w = input.dims()[3];
int output_h = output->dims()[2];
int output_w = output->dims()[3];
int padding_h = paddings[0];
int padding_w = paddings[1];
int image_size = input_h * input_w;
int out_image_size = output_h * output_w;
int valid_h_start = padding_h;
int valid_h_end = output_h - valid_h_start;
int valid_h = valid_h_end - valid_h_start;
int valid_w_start = padding_w;
int valid_w_end = output_w - valid_w_start;
int valid_w = valid_w_end - valid_w_start;
const int input_h = input.dims()[2];
const int input_w = input.dims()[3];
const int output_h = output->dims()[2];
const int output_w = output->dims()[3];
const int padding_h = paddings[0];
const int padding_w = paddings[1];
const int image_size = input_h * input_w;
const int out_image_size = output_h * output_w;
const int valid_h_start = padding_h;
const int valid_h_end = output_h - valid_h_start;
const int valid_h = valid_h_end - valid_h_start;
const int valid_w_start = padding_w;
const int valid_w_end = output_w - valid_w_start;
const int valid_w = valid_w_end - valid_w_start;
#pragma omp parallel for
for (int g = 0; g < input.dims()[1]; ++g) {
......
......@@ -18,7 +18,8 @@ limitations under the License. */
#ifdef _OPENMP
#include <omp.h>
#endif
#include <sys/time.h>
// #include <sys/time.h>
// #include <iostream>
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm/cpu_info.h"
......@@ -158,7 +159,8 @@ class GemmExecutor : public Executor {
}
}
}
strategy_.write(lhs_range, N_, local_C, ldc_, C + lhs_block * ldc, ldc);
strategy_.write(lhs_range, N_, alpha, local_C, ldc_, beta,
C + lhs_block * ldc, ldc);
}
} else {
strategy_.pack_lhs(M_, K_, A, lda, lhs_workspace_, true);
......@@ -188,7 +190,8 @@ class GemmExecutor : public Executor {
}
}
}
strategy_.write(M_, rhs_range, local_C, ldc_, C + rhs_block, ldc);
strategy_.write(M_, rhs_range, alpha, local_C, ldc_, beta,
C + rhs_block, ldc);
}
}
......
......@@ -31,8 +31,9 @@ struct SgemmStrategy {
Itype *, const bool);
typedef void (*kernelFunc)(const Itype *, const Itype *, const int, Otype *,
const int);
typedef void (*WriteFunc)(const int, const int, const Otype *, const int,
Otype *, const int);
typedef void (*WriteFunc)(const int, const int, const float alpha,
const Otype *, const int, const float beta, Otype *,
const int);
packLhsFunc pack_lhs;
packRhsFunc pack_rhs;
......
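The two GemmExecutor hunks and this SgemmStrategy hunk extend WriteFunc with alpha and beta parameters. A minimal scalar sketch of what such a write-back is presumably expected to compute (my assumption; WriteWithAlphaBeta is a hypothetical name, and the real write kernels in this repo are NEON-optimized and may also fuse bias/ReLU):

inline void WriteWithAlphaBeta(const int m, const int n, const float alpha,
                               const float *local_C, const int ldc_local,
                               const float beta, float *C, const int ldc) {
  // C = alpha * local_C + beta * C, copied out tile by tile after the kernel loop.
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      C[i * ldc + j] =
          alpha * local_C[i * ldc_local + j] + beta * C[i * ldc + j];
    }
  }
}

With the call sites above, m is lhs_range (or M_), n is N_ (or rhs_range), ldc_local is ldc_, and the destination is the corresponding block of C with leading dimension ldc.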
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "operators/math/gru_compute.h"
#include "common/types.h"
#include "operators/math/activation.h"
#include "operators/math/gemm.h"
#include "operators/math/gemm/cblas.h"
#include "operators/math/gru_cpu_kernel.h"
namespace paddle_mobile {
......@@ -29,35 +29,19 @@ struct GRUUnitFunctor<CPU, T> {
static void compute(GRUMetaValue<T> value, int frame_size, int batch_size,
const ActivationType active_node,
const ActivationType active_gate) {
Gemm gemm;
if (value.prev_out_value) {
#ifdef _OPENMP
gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false,
static_cast<float *>(nullptr));
#else
gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false,
static_cast<float *>(nullptr));
#endif
cblas_sgemm(false, false, batch_size, frame_size * 2, frame_size, 1.f,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1.f, value.gate_value, frame_size * 3);
}
forward_reset_output(value, frame_size, batch_size, active_gate);
if (value.prev_out_value) {
#ifdef _OPENMP
gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, static_cast<float *>(nullptr));
#else
gemm.Sgemm(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, static_cast<float *>(nullptr));
#endif
cblas_sgemm(false, false, batch_size, frame_size, frame_size, 1.f,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1.f, value.gate_value + frame_size * 2,
frame_size * 3);
}
forward_final_output(value, frame_size, batch_size, active_node);
......@@ -65,6 +49,7 @@ struct GRUUnitFunctor<CPU, T> {
};
template struct GRUUnitFunctor<CPU, float>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......
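Judging from the calls above, the cblas_sgemm wrapper from operators/math/gemm/cblas.h is invoked as cblas_sgemm(trans_a, trans_b, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc), where the two leading bools are presumably transpose flags. A naive reference for the non-transposed case (a sketch only; naive_sgemm is a hypothetical name, and the real wrapper dispatches to the tuned GemmExecutor):

void naive_sgemm(const int M, const int N, const int K, const float alpha,
                 const float *A, const int lda, const float *B, const int ldb,
                 const float beta, float *C, const int ldc) {
  // C = alpha * A * B + beta * C, row-major with leading dimensions lda/ldb/ldc.
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) {
        acc += A[i * lda + k] * B[k * ldb + j];
      }
      C[i * ldc + j] = alpha * acc + beta * C[i * ldc + j];
    }
  }
}

In the GRU calls above, alpha and beta are both 1.f and the output leading dimension is frame_size * 3, so each product accumulates into a slice of gate_value.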
......@@ -71,34 +71,11 @@ void MatMul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
a[index++] = tmp[i * n + j];
}
}
if (M == 1) {
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias);
#else
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias);
#endif
} else {
cblas_sgemm(false, false, M, N, K, alpha, a, K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
}
cblas_sgemm(false, false, M, N, K, alpha, a, K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
} else {
if (M == 1) {
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
N, relu, bias);
#else
gemm.Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, bias);
#endif
} else {
cblas_sgemm(false, false, M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
N);
}
cblas_sgemm(false, false, M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N);
}
}
......
......@@ -803,9 +803,9 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"dup v15.4s, wzr \n"
"cmp %[inter], #0 \n"
"ble loop_1c_%= \n"
"ble 2f \n"
// loop 2 channels
"loop_2c_%=: \n"
"1: \n"
"ld1 {v0.4s, v1.4s}, [%[w_ptr]], #32 \n"
"ld1 {v2.4s, v3.4s}, [%[in_ptr]], #32 \n"
"ld1 {v4.4s, v5.4s}, [%[in_ptr]], #32 \n"
......@@ -829,12 +829,12 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"fmla v15.4s, v5.4s, v1.s[3] \n"
"subs %[inter], %[inter], #1 \n"
"bne loop_2c_%= \n"
"bne 1b \n"
// loop 1 channel
"loop_1c_%=: \n"
"2: \n"
"cmp %[remain], #0 \n"
"ble store_res_%= \n"
"ble 3f \n"
"ld1 {v0.4s, v1.4s}, [%[w_ptr]], #32 \n"
"ld1 {v2.4s, v3.4s}, [%[in_ptr]], #32 \n"
......@@ -847,7 +847,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"fmla v14.4s, v2.4s, v0.s[3] \n"
"fmla v15.4s, v3.4s, v0.s[3] \n"
"store_res_%=: \n"
"3: \n"
"st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[uv_ptr]], #64 \n"
"st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[uv_ptr]], #64 \n"
: [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr),
......
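The hunks above replace the %=-suffixed named labels (loop_2c_%=, loop_1c_%=, store_res_%=) with GNU-style numeric local labels (1:, 2:, 3:) and b/f branch suffixes, presumably to sidestep label issues in the iOS build this commit targets. A minimal AArch64 illustration of the branching convention (not code from this file; countdown is a made-up example):

#include <cstdint>

// "1b" branches backward to the nearest preceding "1:"; "2f" would branch
// forward to the nearest following "2:". Counts a positive n down to zero.
inline int64_t countdown(int64_t n) {
  asm volatile(
      "1:                      \n"
      "subs %[n], %[n], #1     \n"
      "bgt 1b                  \n"
      : [n] "+r"(n)
      :
      : "cc");
  return n;
}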
......@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "dim.h"); do
cpplint $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
......