Commit 73a7f138 authored by hjchen2

Revert lhs packing in sgemm and the depthwise conv5x5 fast path to avoid problems on iOS

Parent 269a3a37
......@@ -42,6 +42,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
} else {
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
}
return DataLayout::kNCHW;
}
inline std::string DataLayoutToString(const DataLayout &data_layout) {
......
......@@ -82,6 +82,8 @@ struct Dim<0> {
int64_t &operator[](int idx);
int64_t operator[](int idx) const;
int64_t head;
};
namespace {
......@@ -131,6 +133,7 @@ int64_t &indexer(Dim<D> &dim, int idx) {
template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
return dim.head;
}
template <int D>
......@@ -147,6 +150,7 @@ int64_t indexer(const Dim<D> &dim, int idx) {
template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
return dim.head;
}
} // namespace
......
......@@ -201,16 +201,16 @@ inline void DepthwiseConv5x5(const ConvParam<CPU> &param) {
Tensor *output = param.Output();
output->mutable_data<Otype>();
if (strides[0] == 1) {
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1);
math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
&out_batch);
}
} else {
GemmConv<Itype, Otype>(param);
}
// if (strides[0] == 1) {
// for (int i = 0; i < batch_size; i++) {
// Tensor in_batch = input->Slice(i, i + 1);
// Tensor out_batch = output->Slice(i, i + 1);
// math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
// &out_batch);
// }
// } else {
GemmConv<Itype, Otype>(param);
// }
}
template <typename ParamType>
......
......@@ -144,20 +144,21 @@ void DepthwiseConv5x5S1<float, float>(const framework::Tensor &input,
const float *input_data = input.data<float>();
const float *filter_data = filter.data<float>();
float *out_data = output->mutable_data<float>();
int input_h = input.dims()[2];
int input_w = input.dims()[3];
int output_h = output->dims()[2];
int output_w = output->dims()[3];
int padding_h = paddings[0];
int padding_w = paddings[1];
int image_size = input_h * input_w;
int out_image_size = output_h * output_w;
int valid_h_start = padding_h;
int valid_h_end = output_h - valid_h_start;
int valid_h = valid_h_end - valid_h_start;
int valid_w_start = padding_w;
int valid_w_end = output_w - valid_w_start;
int valid_w = valid_w_end - valid_w_start;
const int input_h = input.dims()[2];
const int input_w = input.dims()[3];
const int output_h = output->dims()[2];
const int output_w = output->dims()[3];
const int padding_h = paddings[0];
const int padding_w = paddings[1];
const int image_size = input_h * input_w;
const int out_image_size = output_h * output_w;
const int valid_h_start = padding_h;
const int valid_h_end = output_h - valid_h_start;
const int valid_h = valid_h_end - valid_h_start;
const int valid_w_start = padding_w;
const int valid_w_end = output_w - valid_w_start;
const int valid_w = valid_w_end - valid_w_start;
#pragma omp parallel for
for (int g = 0; g < input.dims()[1]; ++g) {
......
......@@ -18,7 +18,8 @@ limitations under the License. */
#ifdef _OPENMP
#include <omp.h>
#endif
#include <sys/time.h>
// #include <sys/time.h>
// #include <iostream>
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm/cpu_info.h"
......@@ -158,7 +159,8 @@ class GemmExecutor : public Executor {
}
}
}
strategy_.write(lhs_range, N_, local_C, ldc_, C + lhs_block * ldc, ldc);
strategy_.write(lhs_range, N_, alpha, local_C, ldc_, beta,
C + lhs_block * ldc, ldc);
}
} else {
strategy_.pack_lhs(M_, K_, A, lda, lhs_workspace_, true);
......@@ -188,7 +190,8 @@ class GemmExecutor : public Executor {
}
}
}
strategy_.write(M_, rhs_range, local_C, ldc_, C + rhs_block, ldc);
strategy_.write(M_, rhs_range, alpha, local_C, ldc_, beta,
C + rhs_block, ldc);
}
}
......
......@@ -31,8 +31,9 @@ struct SgemmStrategy {
Itype *, const bool);
typedef void (*kernelFunc)(const Itype *, const Itype *, const int, Otype *,
const int);
typedef void (*WriteFunc)(const int, const int, const Otype *, const int,
Otype *, const int);
typedef void (*WriteFunc)(const int, const int, const float alpha,
const Otype *, const int, const float beta, Otype *,
const int);
packLhsFunc pack_lhs;
packRhsFunc pack_rhs;
......
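The two GemmExecutor hunks and this SgemmStrategy hunk extend WriteFunc with alpha and beta parameters. A minimal scalar sketch of what such a write-back is presumably expected to compute (my assumption; WriteWithAlphaBeta is a hypothetical name, and the real write kernels in this repo are NEON-optimized and may also fuse bias/ReLU):

inline void WriteWithAlphaBeta(const int m, const int n, const float alpha,
                               const float *local_C, const int ldc_local,
                               const float beta, float *C, const int ldc) {
  // C = alpha * local_C + beta * C, copied out tile by tile after the kernel loop.
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      C[i * ldc + j] =
          alpha * local_C[i * ldc_local + j] + beta * C[i * ldc + j];
    }
  }
}

With the call sites above, m is lhs_range (or M_), n is N_ (or rhs_range), ldc_local is ldc_, and the destination is the corresponding block of C with leading dimension ldc.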
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "operators/math/gru_compute.h"
#include "common/types.h"
#include "operators/math/activation.h"
#include "operators/math/gemm.h"
#include "operators/math/gemm/cblas.h"
#include "operators/math/gru_cpu_kernel.h"
namespace paddle_mobile {
......@@ -29,35 +29,19 @@ struct GRUUnitFunctor<CPU, T> {
static void compute(GRUMetaValue<T> value, int frame_size, int batch_size,
const ActivationType active_node,
const ActivationType active_gate) {
Gemm gemm;
if (value.prev_out_value) {
#ifdef _OPENMP
gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false,
static_cast<float *>(nullptr));
#else
gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false,
static_cast<float *>(nullptr));
#endif
cblas_sgemm(false, false, batch_size, frame_size * 2, frame_size, 1.f,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1.f, value.gate_value, frame_size * 3);
}
forward_reset_output(value, frame_size, batch_size, active_gate);
if (value.prev_out_value) {
#ifdef _OPENMP
gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, static_cast<float *>(nullptr));
#else
gemm.Sgemm(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, static_cast<float *>(nullptr));
#endif
cblas_sgemm(false, false, batch_size, frame_size, frame_size, 1.f,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1.f, value.gate_value + frame_size * 2,
frame_size * 3);
}
forward_final_output(value, frame_size, batch_size, active_node);
......@@ -65,6 +49,7 @@ struct GRUUnitFunctor<CPU, T> {
};
template struct GRUUnitFunctor<CPU, float>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......
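Judging from the calls above, the cblas_sgemm wrapper from operators/math/gemm/cblas.h is invoked as cblas_sgemm(trans_a, trans_b, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc), where the two leading bools are presumably transpose flags. A naive reference for the non-transposed case (a sketch only; naive_sgemm is a hypothetical name, and the real wrapper dispatches to the tuned GemmExecutor):

void naive_sgemm(const int M, const int N, const int K, const float alpha,
                 const float *A, const int lda, const float *B, const int ldb,
                 const float beta, float *C, const int ldc) {
  // C = alpha * A * B + beta * C, row-major with leading dimensions lda/ldb/ldc.
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) {
        acc += A[i * lda + k] * B[k * ldb + j];
      }
      C[i * ldc + j] = alpha * acc + beta * C[i * ldc + j];
    }
  }
}

In the GRU calls above, alpha and beta are both 1.f and the output leading dimension is frame_size * 3, so each product accumulates into a slice of gate_value.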
......@@ -71,34 +71,11 @@ void MatMul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
a[index++] = tmp[i * n + j];
}
}
if (M == 1) {
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias);
#else
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias);
#endif
} else {
cblas_sgemm(false, false, M, N, K, alpha, a, K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
}
cblas_sgemm(false, false, M, N, K, alpha, a, K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
} else {
if (M == 1) {
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
N, relu, bias);
#else
gemm.Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, bias);
#endif
} else {
cblas_sgemm(false, false, M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
N);
}
cblas_sgemm(false, false, M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N);
}
}
......
......@@ -803,9 +803,9 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"dup v15.4s, wzr \n"
"cmp %[inter], #0 \n"
"ble loop_1c_%= \n"
"ble 2f \n"
// loop 2 channels
"loop_2c_%=: \n"
"1: \n"
"ld1 {v0.4s, v1.4s}, [%[w_ptr]], #32 \n"
"ld1 {v2.4s, v3.4s}, [%[in_ptr]], #32 \n"
"ld1 {v4.4s, v5.4s}, [%[in_ptr]], #32 \n"
......@@ -829,12 +829,12 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"fmla v15.4s, v5.4s, v1.s[3] \n"
"subs %[inter], %[inter], #1 \n"
"bne loop_2c_%= \n"
"bne 1b \n"
// loop 1 channel
"loop_1c_%=: \n"
"2: \n"
"cmp %[remain], #0 \n"
"ble store_res_%= \n"
"ble 3f \n"
"ld1 {v0.4s, v1.4s}, [%[w_ptr]], #32 \n"
"ld1 {v2.4s, v3.4s}, [%[in_ptr]], #32 \n"
......@@ -847,7 +847,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"fmla v14.4s, v2.4s, v0.s[3] \n"
"fmla v15.4s, v3.4s, v0.s[3] \n"
"store_res_%=: \n"
"3: \n"
"st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[uv_ptr]], #64 \n"
"st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[uv_ptr]], #64 \n"
: [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr),
......
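The hunks above replace the %=-suffixed named labels (loop_2c_%=, loop_1c_%=, store_res_%=) with GNU-style numeric local labels (1:, 2:, 3:) and b/f branch suffixes, presumably to sidestep label issues in the iOS build this commit targets. A minimal AArch64 illustration of the branching convention (not code from this file; countdown is a made-up example):

#include <cstdint>

// "1b" branches backward to the nearest preceding "1:"; "2f" would branch
// forward to the nearest following "2:". Counts a positive n down to zero.
inline int64_t countdown(int64_t n) {
  asm volatile(
      "1:                      \n"
      "subs %[n], %[n], #1     \n"
      "bgt 1b                  \n"
      : [n] "+r"(n)
      :
      : "cc");
  return n;
}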
......@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "dim.h"); do
cpplint $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
......