Commit f2423c2d authored by D dolphin8

merge with upstream

......@@ -9,7 +9,7 @@ option(LOG_PROFILE "log profile" ON)
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
set(DEBUGING ON)
if (CPU)
add_definitions(-DPADDLE_MOBILE_CPU)
endif()
......@@ -28,7 +28,7 @@ set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
if (DEBUGING)
message(STATUS "debug")
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG")
add_definitions(-DPADDLE_MOBILE_DEBUG)
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
add_definitions(-DARMV7)
......@@ -36,6 +36,7 @@ if (DEBUGING)
endif ()
else ()
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif ()
......
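For reference, a minimal sketch (not part of this commit) of what the added -DNDEBUG define controls: when NDEBUG is set on the compile line, assert() from <cassert> expands to a no-op, so such checks are compiled out of the builds configured above.

#include <cassert>

int main() {
  int batch_size = 1;
  // With -DNDEBUG this assert expands to ((void)0) and disappears from the binary.
  assert(batch_size > 0 && "batch_size must be positive");
  return 0;
}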
......@@ -63,6 +63,7 @@ class OperatorBase {
std::vector<string> GetOutKeys() const;
virtual void RunImpl() const = 0;
virtual void Init() const = 0;
/*
* @b Inputs required by the op's computation, e.g. the previous layer's output or the convolution filters
* */
......@@ -110,15 +111,17 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
param_(inputs, outputs, attrs, *scope) {
PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), " %s kernel init failed",
this->type_.c_str());
}
param_(inputs, outputs, attrs, *scope) {}
virtual void RunImpl() const { this->kernel_.Compute(this->param_); }
virtual void InferShape() const = 0;
void Init() const {
PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), " %s kernel init failed",
this->type_.c_str());
}
protected:
KernelType kernel_;
ParamType param_;
......
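A rough sketch of the two-phase lifecycle this change introduces for OperatorWithKernel: the constructor now only builds param_, and kernel initialization is deferred to Init(), which the executor calls once before any run. The Demo* names below are illustrative assumptions, not identifiers from the source.

#include <cassert>

struct DemoParam {};

struct DemoKernel {
  bool Init(const DemoParam &) const { return true; }   // heavy setup happens here, not in the ctor
  void Compute(const DemoParam &) const {}
};

class DemoOpWithKernel {
 public:
  DemoOpWithKernel() : param_() {}                       // constructor stays cheap
  void Init() const { assert(kernel_.Init(param_)); }    // called once by the executor
  void RunImpl() const { kernel_.Compute(param_); }      // called on every prediction
 private:
  DemoKernel kernel_;
  DemoParam param_;
};

int main() {
  DemoOpWithKernel op;
  op.Init();     // executor: after loading weights
  op.RunImpl();  // executor: inside Predict()
  return 0;
}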
......@@ -198,6 +198,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
} else {
InitMemory();
}
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
op->Init();
}
}
template <typename Dtype, Precision P>
......@@ -416,6 +423,8 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
// run the op
ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
......
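The profiling block above folds a CLOCK_MONOTONIC reading into one 64-bit nanosecond counter; a minimal standalone sketch of that conversion (the helper name NowNs is assumed, not part of the source):

#include <stdint.h>
#include <time.h>

static uint64_t NowNs() {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  // Equivalent to the profile[i].runBegin computation above: seconds scaled to ns plus the ns field.
  return static_cast<uint64_t>(ts.tv_sec) * 1000000000ull + ts.tv_nsec;
}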
......@@ -32,6 +32,8 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() const {}
void InferShape() const {
auto out_dims = param_.Out()->dims();
out_dims[0] = param_.BatchSize();
......
......@@ -33,6 +33,8 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() const {}
void InferShape() const {
auto x_dims = param_.InputX()->dims();
param_.Out()->Resize(x_dims);
......
......@@ -21,6 +21,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool BatchNormKernel<CPU, float>::Init(const BatchNormParam &para) const {
return true;
}
template <>
void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
const Tensor *input_x = param.InputX();
......
......@@ -110,6 +110,11 @@ void DecodeCenterSize(const framework::Tensor& target_box,
}
}
template <>
bool BoxCoderKernel<CPU, float>::Init(const BoxCoderParam& para) const {
return true;
}
template <>
void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const {
const auto* input_priorbox = param.InputPriorBox();
......
......@@ -52,6 +52,11 @@ class ConcatFunctor {
}
};
template <>
bool ConcatKernel<CPU, float>::Init(const ConcatParam &para) const {
return true;
}
template <>
void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
auto inputs = param.Inputs();
......
......@@ -18,6 +18,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddKernel<CPU, float>::Init(const FusionConvAddParam &para) const {
return true;
}
template <>
void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
const Tensor *input = param.Input();
......
......@@ -19,6 +19,12 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddReluKernel<CPU, float>::Init(
const FusionConvAddReluParam &para) const {
return true;
}
template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam &param) const {
......
......@@ -19,6 +19,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool ConvKernel<CPU, float>::Init(const ConvParam &para) const {
return true;
}
template <>
void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
const Tensor *input = param.Input();
......
......@@ -20,6 +20,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam &para) const {
return true;
}
template <>
void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
LOG(kLOG_DEBUG) << param;
......
......@@ -26,6 +26,12 @@ struct AddFunctor {
inline T operator()(T a, T b) const { return a + b; }
};
template <>
bool ElementwiseAddKernel<CPU, float>::Init(
const ElementwiseAddParam &para) const {
return true;
}
template <>
void ElementwiseAddKernel<CPU, float>::Compute(
const ElementwiseAddParam &param) const {
......
......@@ -21,6 +21,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcKernel<CPU, float>::Init(const FusionFcParam &para) const {
return true;
}
template <>
void FusionFcKernel<CPU, float>::Compute(const FusionFcParam &param) const {
const Tensor *input_x = param.InputX();
......
......@@ -21,6 +21,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool LrnKernel<CPU, float>::Init(const LrnParam &para) const {
return true;
}
template <>
void LrnKernel<CPU, float>::Compute(const LrnParam &param) const {
const Tensor *input_x = param.InputX();
......
......@@ -21,6 +21,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<CPU, float>::Init(const MulParam &para) const {
return true;
}
template <>
void MulKernel<CPU, float>::Compute(const MulParam &param) const {
const Tensor *input_x = param.InputX();
......
......@@ -203,6 +203,12 @@ void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
}
}
template <>
bool MultiClassNMSKernel<CPU, float>::Init(
const MultiClassNMSParam& para) const {
return true;
}
template <>
void MultiClassNMSKernel<CPU, float>::Compute(
const MultiClassNMSParam& param) const {
......
......@@ -35,6 +35,11 @@ inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
}
}
template <>
bool PoolKernel<CPU, float>::Init(const PoolParam &para) const {
return true;
}
template <>
void PoolKernel<CPU, float>::Compute(const PoolParam &param) const {
const Tensor *in_x = param.Input();
......
......@@ -26,6 +26,11 @@ struct ClipFunctor {
}
};
template <>
bool PriorBoxKernel<CPU, float>::Init(const PriorBoxParam &para) const {
return true;
}
template <>
void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam &param) const {
const auto *input_ = param.Input();
......
......@@ -25,6 +25,11 @@ struct ReluFunctor {
inline T operator()(T in) const { return in > 0 ? in : 0; }
};
template <>
bool ReluKernel<CPU, float>::Init(const ReluParam &para) const {
return true;
}
/*
* @b Platform-specific specialization of the kernel; param is passed in from the op layer
* */
......
......@@ -19,6 +19,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool ReshapeKernel<CPU, float>::Init(const ReshapeParam &para) const {
return true;
}
template <>
void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const {
const auto *input_x = param.InputX();
......
......@@ -71,6 +71,11 @@ void sigmoid(const Tensor *X, Tensor *Y) {
#endif
}
template <>
bool SigmoidKernel<CPU, float>::Init(const SigmoidParam &para) const {
return true;
}
template <>
void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const {
const Tensor *in_x = param.InputX();
......
......@@ -19,6 +19,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool SoftmaxKernel<CPU, float>::Init(const SoftmaxParam &para) const {
return true;
}
template <>
void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam &param) const {
const Tensor *in_x = param.InputX();
......
......@@ -34,6 +34,11 @@ namespace operators {
// }
// }
template <>
bool TransposeKernel<CPU, float>::Init(const TransposeParam& para) const {
return true;
}
template <>
void TransposeKernel<CPU, float>::Compute(const TransposeParam& param) const {
const auto* input_x = param.InputX();
......
......@@ -29,6 +29,7 @@ class BatchNormKernel
: public framework::OpKernelBase<DeviceType, BatchNormParam> {
public:
void Compute(const BatchNormParam &param) const;
bool Init(const BatchNormParam &para) const;
};
} // namespace operators
......
......@@ -30,6 +30,7 @@ class BoxCoderKernel
: public framework::OpKernelBase<DeviceType, BoxCoderParam> {
public:
void Compute(const BoxCoderParam& param) const;
bool Init(const BoxCoderParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -27,6 +27,7 @@ template <typename DeviceType, typename T>
class ConcatKernel : public framework::OpKernelBase<DeviceType, ConcatParam> {
public:
void Compute(const ConcatParam &param) const;
bool Init(const ConcatParam &para) const;
};
} // namespace operators
......
......@@ -38,6 +38,7 @@ template <typename DeviceType, typename T>
class ConvAddKernel : public OpKernelBase<DeviceType, FusionConvAddParam> {
public:
void Compute(const FusionConvAddParam &param) const;
bool Init(const FusionConvAddParam &para) const;
};
} // namespace operators
......
......@@ -36,6 +36,7 @@ class ConvAddReluKernel
: public OpKernelBase<DeviceType, FusionConvAddReluParam> {
public:
void Compute(const FusionConvAddReluParam &param) const;
bool Init(const FusionConvAddReluParam &para) const;
};
} // namespace operators
......
......@@ -32,6 +32,7 @@ template <typename DeviceType, typename T>
class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
public:
void Compute(const ConvParam &param) const;
bool Init(const ConvParam &para) const;
};
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
......
......@@ -31,6 +31,7 @@ template <typename DeviceType, typename T>
class DepthwiseConvKernel : public OpKernelBase<DeviceType, ConvParam> {
public:
void Compute(const ConvParam &param) const;
bool Init(const ConvParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -30,6 +30,7 @@ class ElementwiseAddKernel
: public framework::OpKernelBase<DeviceType, ElementwiseAddParam> {
public:
void Compute(const ElementwiseAddParam &param) const;
bool Init(const ElementwiseAddParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -19,6 +19,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool ConvKernel<FPGA, float>::Init(const ConvParam &para) const {
return true;
}
template <>
void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const {}
template class ConvKernel<FPGA, float>;
......
......@@ -28,6 +28,7 @@ class FusionFcKernel
: public framework::OpKernelBase<DeviceType, FusionFcParam> {
public:
void Compute(const FusionFcParam& param) const;
bool Init(const FusionFcParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -170,6 +170,7 @@ template <typename DeviceType, typename T>
class LrnKernel : public framework::OpKernelBase<DeviceType, LrnParam> {
public:
void Compute(const LrnParam &param) const;
bool Init(const LrnParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -20,6 +20,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam &para) const {
return true;
}
template <>
void BatchNormKernel<GPU_MALI, float>::Compute(
const BatchNormParam &param) const {}
......
......@@ -19,6 +19,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool ConvKernel<GPU_MALI, float>::Init(const ConvParam &para) const {
return true;
}
template <>
void ConvKernel<GPU_MALI, float>::Compute(const ConvParam &param) const {
// ArmConvImplement imp;
......
......@@ -29,6 +29,7 @@ template <typename DeviceType, typename T>
class MulKernel : public framework::OpKernelBase<DeviceType, MulParam> {
public:
void Compute(const MulParam &param) const;
bool Init(const MulParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -28,6 +28,7 @@ class MultiClassNMSKernel
: public framework::OpKernelBase<DeviceType, MultiClassNMSParam> {
public:
void Compute(const MultiClassNMSParam& param) const;
bool Init(const MultiClassNMSParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -28,6 +28,7 @@ template <typename DeviceType, typename T>
class PoolKernel : public OpKernelBase<DeviceType, PoolParam> {
public:
void Compute(const PoolParam &param) const override;
bool Init(const PoolParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -55,6 +55,7 @@ class PriorBoxKernel
: public framework::OpKernelBase<DeviceType, PriorBoxParam> {
public:
void Compute(const PriorBoxParam& param) const;
bool Init(const PriorBoxParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -27,6 +27,7 @@ template <typename DeviceType, typename T>
class ReluKernel : public framework::OpKernelBase<DeviceType, ReluParam> {
public:
void Compute(const ReluParam& param) const;
bool Init(const ReluParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -71,6 +71,7 @@ template <typename DeviceType, typename T>
class ReshapeKernel : public framework::OpKernelBase<DeviceType, ReshapeParam> {
public:
void Compute(const ReshapeParam& param) const;
bool Init(const ReshapeParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -26,6 +26,7 @@ template <typename DeviceType, typename T>
class SigmoidKernel : public OpKernelBase<DeviceType, SigmoidParam> {
public:
void Compute(const SigmoidParam& param) const override;
bool Init(const SigmoidParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -29,6 +29,7 @@ template <typename DeviceType, typename T>
class SoftmaxKernel : public OpKernelBase<DeviceType, SoftmaxParam> {
public:
void Compute(const SoftmaxParam &param) const override;
bool Init(const SoftmaxParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -29,6 +29,7 @@ class TransposeKernel
: public framework::OpKernelBase<DeviceType, TransposeParam> {
public:
void Compute(const TransposeParam& param) const;
bool Init(const TransposeParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile
......
......@@ -26,12 +26,12 @@ alignas(64) float packedA[MC * KC];
alignas(64) float packedB[KC * NC];
alignas(64) float ab[MR * NR];
// Pack matrix A block by block into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
int i, j;
const float *Aij;
for (i = 0; i < m - paddingM; i += MR) {
for (int j = 0; j < k; ++j) {
for (i = 0; i < m - m_tail; i += MR) {
for (j = 0; j < k; ++j) {
Aij = &A(i, j);
*buffer++ = *Aij;
*buffer++ = *(Aij + 1);
......@@ -39,13 +39,13 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
*buffer++ = *(Aij + 3);
}
}
if (paddingM != 0) {
if (m_tail != 0) {
for (j = 0; j < k; ++j) {
Aij = &A(m - paddingM, j);
for (i = 0; i < paddingM; ++i) {
Aij = &A(m - m_tail, j);
for (i = 0; i < m_tail; ++i) {
*buffer++ = *(Aij + i);
}
for (i = paddingM; i < MR; ++i) {
for (i = m_tail; i < MR; ++i) {
*buffer++ = 0;
}
}
......@@ -53,11 +53,11 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
}
// Pack matrix A block by block into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
int i, j;
const float *Ai, *Ai1, *Ai2, *Ai3;
for (i = 0; i < m - paddingM; i += MR) {
for (i = 0; i < m - m_tail; i += MR) {
Ai = &A(i, 0);
Ai1 = &A(i + 1, 0);
Ai2 = &A(i + 2, 0);
......@@ -69,12 +69,12 @@ void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
*buffer++ = *Ai3++;
}
}
if (paddingM != 0) {
if (m_tail != 0) {
for (j = 0; j < k; ++j) {
for (i = m - paddingM; i < m; ++i) {
for (i = m - m_tail; i < m; ++i) {
*buffer++ = A(i, j);
}
for (i = m; i < m + (MR - paddingM); ++i) {
for (i = m; i < m + (MR - m_tail); ++i) {
*buffer++ = 0;
}
}
......@@ -82,11 +82,11 @@ void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
}
// Pack matrix B block by block into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
int i, j;
const float *Bj, *Bj1, *Bj2, *Bj3;
for (j = 0; j < n - paddingN; j += NR) {
for (j = 0; j < n - n_tail; j += NR) {
Bj = &B(0, j);
Bj1 = &B(0, j + 1);
Bj2 = &B(0, j + 2);
......@@ -98,12 +98,12 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
*buffer++ = *Bj3++;
}
}
if (paddingN != 0) {
if (n_tail != 0) {
for (i = 0; i < k; ++i) {
for (int j = n - paddingN; j < n; ++j) {
for (int j = n - n_tail; j < n; ++j) {
*buffer++ = B(i, j);
}
for (int j = n; j < n + (NR - paddingN); ++j) {
for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0;
}
}
......@@ -111,11 +111,11 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
}
// Pack matrix B block by block into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
int i, j;
const float *Bij;
for (j = 0; j < n - paddingN; j += NR) {
for (j = 0; j < n - n_tail; j += NR) {
for (i = 0; i < k; ++i) {
Bij = &B(i, j);
asm volatile(
......@@ -126,13 +126,13 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
: "memory", "q0");
}
}
if (paddingN != 0) {
if (n_tail != 0) {
for (i = 0; i < k; ++i) {
Bij = &B(i, n - paddingN);
for (int j = n - paddingN; j < n; ++j) {
Bij = &B(i, n - n_tail);
for (int j = n - n_tail; j < n; ++j) {
*buffer++ = *Bij++;
}
for (int j = n; j < n + (NR - paddingN); ++j) {
for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0;
}
}
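What the renamed m_tail / n_tail parameters mean in the packing routines above: the leftover rows (or columns) that do not fill a whole MR-wide (NR-wide) panel are copied and then zero-padded, so the 4x4 micro-kernel always reads full panels. A minimal sketch assuming MR = 4 and row-major A; the function name PackTailPanelA is illustrative, not from the source.

constexpr int MR = 4;

// Packs the final, partial row-panel of A (m_tail = m % MR rows),
// zero-filling each column of the panel up to MR entries.
void PackTailPanelA(int m, int k, int m_tail, const float *A, int lda,
                    float *buffer) {
  for (int j = 0; j < k; ++j) {
    for (int i = m - m_tail; i < m; ++i) {
      *buffer++ = A[i * lda + j];  // real data
    }
    for (int i = 0; i < MR - m_tail; ++i) {
      *buffer++ = 0.f;             // padding rows
    }
  }
}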
......@@ -143,33 +143,25 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
int first_time) {
int Buff_A_M = m;
int Buff_B_N = n;
int m_block = (m + MR - 1) / MR * MR;
int n_block = (n + NR - 1) / NR * NR;
int _mc = m % MR;
int _nc = n % NR;
if (_mc != 0) {
Buff_A_M = m + (MR - _mc);
}
if (_nc != 0) {
Buff_B_N = n + (NR - _nc);
}
int m_tail = m % MR;
int n_tail = n % NR;
if (first_time) {
PackMatrixB_(k, n, _nc, B, ldb, packedB);
PackMatrixB_(k, n, n_tail, B, ldb, packedB);
}
PackMatrixA_(m, k, _mc, A, lda, packedA);
PackMatrixA_(m, k, m_tail, A, lda, packedA);
int i, j, mc, nc;
// Take 4 columns of B at a time (packed, warms the cache)
for (j = 0; j < Buff_B_N; j += NR) {
nc = (n - j) < NR ? _nc : NR;
for (j = 0; j < n_block; j += NR) {
nc = (n - j) < NR ? n_tail : NR;
// Take 4 rows of A at a time (packed, warms the cache)
for (i = 0; i < Buff_A_M; i += MR) {
mc = (m - i) < MR ? _mc : MR;
for (i = 0; i < m_block; i += MR) {
mc = (m - i) < MR ? m_tail : MR;
AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
&C(i, j), ldc, mc, nc);
}
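The Buff_A_M / Buff_B_N bookkeeping is replaced above by a direct round-up to the panel size. A worked example, assuming MR = NR = 4 and the 62 x 74 by 74 x 63 shapes used by the gemm test later in this commit:

#include <cassert>

int main() {
  const int MR = 4, NR = 4;
  int m = 62, n = 63;
  int m_block = (m + MR - 1) / MR * MR;  // 64: m rounded up to a multiple of MR
  int n_block = (n + NR - 1) / NR * NR;  // 64: n rounded up to a multiple of NR
  int m_tail = m % MR;                   // 2: rows in the last, partial A panel
  int n_tail = n % NR;                   // 3: columns in the last, partial B panel
  assert(m_block == 64 && n_block == 64 && m_tail == 2 && n_tail == 3);
  return 0;
}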
......@@ -180,36 +172,25 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
int first_time, bool relu = false) {
int Buff_A_M = m;
int Buff_B_N = n;
int _mc = m % MR;
int _nc = n % NR;
int m_block = (m + MR - 1) / MR * MR;
int n_block = (n + NR - 1) / NR * NR;
if (_mc != 0) {
Buff_A_M = m + (MR - _mc);
}
if (_nc != 0) {
Buff_B_N = n + (NR - _nc);
}
float packedA[MC * KC];
static float packedB[KC * NC];
int m_tail = m % MR;
int n_tail = n % NR;
if (first_time) {
PackMatrixB_(k, n, _nc, B, ldb, packedB);
PackMatrixB_(k, n, n_tail, B, ldb, packedB);
}
PackMatrixA_(m, k, _mc, A, lda, packedA);
PackMatrixA_(m, k, m_tail, A, lda, packedA);
int i, j, mc, nc;
// Take 4 columns of B at a time (packed, warms the cache)
for (j = 0; j < Buff_B_N; j += NR) {
nc = (n - j) < NR ? _nc : NR;
for (j = 0; j < n_block; j += NR) {
nc = (n - j) < NR ? n_tail : NR;
// Take 4 rows of A at a time (packed, warms the cache)
for (i = 0; i < Buff_A_M; i += MR) {
mc = (m - i) < MR ? _mc : MR;
for (i = 0; i < m_block; i += MR) {
mc = (m - i) < MR ? m_tail : MR;
AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
&C(i, j), ldc, mc, nc, relu);
}
......@@ -375,12 +356,15 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
......@@ -525,12 +509,15 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
......@@ -578,10 +565,10 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
......@@ -599,7 +586,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13");
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13",
"q14");
if (mc != MR || nc != NR) {
int i, j;
......
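The vmax.f32 lines in AddDot4x4_relu clamp each accumulator against q14, which is presumably loaded with zeros in an elided part of the kernel; the clobber list now declares q14 accordingly. A scalar C++ sketch of the same clamp (an assumption about intent, not a translation of the assembly):

#include <algorithm>

// Element-wise equivalent of "vmax.f32 qN, qN, q14" with q14 holding 0.0f.
inline float Relu(float x) { return std::max(x, 0.0f); }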
......@@ -33,19 +33,19 @@ namespace operators {
namespace math {
// Pack matrix A block by block into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// Pack matrix B block by block into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// Pack matrix A block by block into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// Pack matrix B block by block into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// Blocked matrix multiplication
......
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#define a(i, j) a[(i)*lda + (j)]
......@@ -29,10 +31,15 @@ int main() {
int ldb = n;
int ldc = n;
float a[62 * 74];
float b[74 * 63];
float c[62 * 63] = {0};
float c1[62 * 63] = {0};
float *a =
static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
float *b =
static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
float *c =
static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
float *c1 =
static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
for (int i = 0; i < m * k; ++i) {
a[i] = 2;
}
......@@ -44,8 +51,11 @@ int main() {
c1[i] = 2;
}
auto time1 = time();
paddle_mobile::operators::math::sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, c,
ldc);
auto time2 = time();
DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n";
for (int i = 0; i < m * n; ++i) {
std::cout << c[i] << " | ";
if (i % n == (n - 1)) {
......