Commit c5f70926 authored by hjchen2

Merge branch 'ocr_ctc' of https://github.com/hjchen2/paddle-mobile into ocr_ctc

@@ -350,7 +350,7 @@ PMStatus Executor<Device, T>::Predict() {
       _tp[ops_list_[i]->Type()] += timeCost;
     }
   }
-  DLOG << "====================[ profile ]======================";
+  printf("====================[ profile ]======================\n");
   typedef std::pair<std::string, uint64_t> prof_t;
   std::vector<prof_t> _tv(_tp.begin(), _tp.end());
   uint64_t _ptotal = 0;
@@ -367,7 +367,7 @@ PMStatus Executor<Device, T>::Predict() {
                static_cast<float>(p.second),
                static_cast<float>(p.second) / _ptotal * 100.0);
   }
-  DLOG << "====================[---------]======================";
+  printf("====================[---------]======================\n");
 #endif
   return PMSuccess;
 }
......
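The profile block above accumulates per-op-type cost in `_tp` and prints each type's share of the total. A minimal self-contained sketch of that aggregation (names are illustrative, not the executor's actual members):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Sketch of the profile table printed by Executor::Predict above:
// accumulate cost per op type, then report each type's share of the total.
void PrintProfile(const std::map<std::string, uint64_t> &tp) {
  std::vector<std::pair<std::string, uint64_t>> tv(tp.begin(), tp.end());
  uint64_t total = 0;
  for (const auto &p : tv) total += p.second;
  printf("====================[ profile ]======================\n");
  for (const auto &p : tv) {
    printf("%-24s %12.0f %6.2f%%\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / total * 100.0f);
  }
  printf("====================[---------]======================\n");
}
```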
@@ -25,12 +25,11 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
-using std::string;
 template <typename DeviceType, typename T>
 class FillConstantOp : public framework::OperatorBase<DeviceType> {
  public:
-  FillConstantOp(const string &type, const VariableNameMap &inputs,
+  FillConstantOp(const std::string &type, const VariableNameMap &inputs,
                  const VariableNameMap &outputs,
                  const framework::AttributeMap attrs,
                  std::shared_ptr<framework::Scope> scope)
@@ -58,7 +57,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
     tensor->Resize(framework::make_ddim(param_.Shape()));
     tensor->mutable_data(framework::ToTypeIndex(data_type));
-    math::set_constant(tensor, value);
+    math::SetConstant(tensor, value);
   }
   void Init() {}
......
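For context, the renamed helper fills every element of a tensor with one value. A minimal usage sketch, assuming the headers shown elsewhere in this diff and a float tensor:

```cpp
#include "framework/ddim.h"
#include "framework/tensor.h"
#include "operators/math/math_function.h"

// Sketch only: allocate a small float tensor and fill it with 0.5f,
// mirroring what FillConstantOp does above after Resize/mutable_data.
void SetConstantExample() {
  paddle_mobile::framework::Tensor t;
  t.mutable_data<float>(paddle_mobile::framework::make_ddim({2, 3}));
  paddle_mobile::operators::math::SetConstant(&t, 0.5f);
}
```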
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef FUSION_CONVADDADDPRELU_OP
 #pragma once
+#include <string>
 #include <vector>
 #include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
@@ -115,20 +116,7 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
       Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step);
       float *biase_data1 = bias1_slice.data<float>();
-      // int n = bias1_slice.dims()[0];
-      // int m = bias1_slice.dims()[1];
-      // for(int i=0;i<n*m;i++){
-      //   if(biase_data1[i]!=0)
-      //     DLOG<<biase_data1[i]<<",yangfei";
-      // }
-      // math::matmul<float>(filter_slice, false, col_matrix,
-      //                     false,
-      //                     static_cast<float>(1),
-      //                     &out_slice,
-      //                     static_cast<float>(1), true,
-      //                     biase_data);
-      math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
+      math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
                             p, mode, biase_data, biase_data1);
     }
   }
@@ -137,4 +125,4 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
 }  // namespace operators
 }  // namespace paddle_mobile
-#endif
+#endif  // FUSION_CONVADDADDPRELU_OP
@@ -107,7 +107,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float, float>(filter_slice, false, col_matrix, false,
+      math::MatMul<float, float>(filter_slice, false, col_matrix, false,
                                  static_cast<float>(1), &out_slice,
                                  static_cast<float>(1), false, biase_data);
     }
......
@@ -25,6 +25,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
@@ -105,12 +106,13 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) {
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmulWithBn<float>(
-          filter_slice, false, col_matrix, false, static_cast<float>(1),
-          &out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
+      math::MatMulWithBn(filter_slice, false, col_matrix, false,
+                         static_cast<float>(1), &out_slice,
+                         static_cast<float>(0), true, &new_scale, &new_bias, g);
     }
   }
 }
 template <typename P>
 void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) {
   Tensor Bias;
@@ -126,9 +128,6 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) {
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
       param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
-    // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
-    //                                     param.Output(), param.NewScale(),
-    //                                     param.NewBias(), 1);
     math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
                                           param.Output(), param.NewScale(),
                                           param.NewBias(), true);
......
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef FUSION_CONVADDPRELU_OP
 #pragma once
+#include <string>
 #include <vector>
 #include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
@@ -30,8 +31,6 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor bias = *param.Bias();
-  // DLOG<<"yangfei";
-  // DLOG<<bias.dims();
   int axis = param.Axis();
   Tensor *output = param.Output();
   float *biase_data = bias.data<float>();
@@ -112,13 +111,7 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      // math::matmul<float>(filter_slice, false, col_matrix,
-      //                     false,
-      //                     static_cast<float>(1),
-      //                     &out_slice,
-      //                     static_cast<float>(1), true,
-      //                     biase_data);
-      math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
+      math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
                             p, mode, biase_data, nullptr);
     }
   }
@@ -127,4 +120,4 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
 }  // namespace operators
 }  // namespace paddle_mobile
-#endif
+#endif  // FUSION_CONVADDPRELU_OP
@@ -112,7 +112,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false, alpha,
+      math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false, alpha,
                                  &out_slice, beta, true, bias_data);
     }
   }
......
@@ -106,7 +106,7 @@ inline void GemmConv(const ConvParam<CPU> &param) {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false,
+      math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
                                  static_cast<float>(1), &out_slice,
                                  static_cast<float>(0), false,
                                  static_cast<Otype *>(nullptr));
......
@@ -108,10 +108,10 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
       Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
-      math::matmulWithBn<float>(filter_slice, false, col_matrix, false,
-                                static_cast<float>(1), &out_slice,
-                                static_cast<float>(1), true, &new_scale,
-                                &new_bias, g, bias_data.data<float>());
+      math::MatMulWithBn(filter_slice, false, col_matrix, false,
+                         static_cast<float>(1), &out_slice,
+                         static_cast<float>(1), true, &new_scale, &new_bias, g,
+                         bias_data.data<float>());
     }
   }
 }
......
@@ -107,9 +107,9 @@ void ConvBNReluBasic(const FusionConvBNReluParam<CPU> &param) {
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmulWithBn<float>(
-          filter_slice, false, col_matrix, false, static_cast<float>(1),
-          &out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
+      math::MatMulWithBn(filter_slice, false, col_matrix, false,
+                         static_cast<float>(1), &out_slice,
+                         static_cast<float>(0), true, &new_scale, &new_bias, g);
     }
   }
 }
......
@@ -93,7 +93,7 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) {
       Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
       Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<P, P>(filter_slice, true, in_slice, false,
+      math::MatMul<P, P>(filter_slice, true, in_slice, false,
                          static_cast<P>(1.0), &col_matrix, static_cast<P>(0.0));
       if (data_dim == 2U) {
         col2im(col, dilations, strides,
......
@@ -106,9 +106,9 @@ void DWConvBNReluBasic(const FusionDWConvBNReluParam<CPU> &param) {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmulWithBn<float>(
-          filter_slice, false, col_matrix, false, static_cast<float>(1),
-          &out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
+      math::MatMulWithBn(filter_slice, false, col_matrix, false,
+                         static_cast<float>(1), &out_slice,
+                         static_cast<float>(0), true, &new_scale, &new_bias, g);
     }
   }
 }
......
@@ -57,7 +57,7 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
   for (int i = 0; i < out_dim[0]; i++) {
     memory::Copy(out_data + i * classes, input_z_data, sizeof(Otype) * classes);
   }
-  math::matmul<Itype, Otype>(x_matrix, false, y_matrix, false,
+  math::MatMul<Itype, Otype>(x_matrix, false, y_matrix, false,
                              static_cast<float>(1), out, static_cast<float>(1),
                              false);
 }
......
@@ -25,18 +25,16 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-template <typename DeviceType, typename T>
+template <typename Device, typename T>
 inline void ReorderInitState(const framework::Tensor& src,
                              std::vector<size_t> index_lod,
                              framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceType, T> row_shuffle;
+  math::CopyMatrixRowsFunctor<Device, T> row_shuffle;
   dst->mutable_data<T>(src.dims());
   row_shuffle(src, index_lod, dst, indexed_src);
 }
-template <typename P>
+template <typename T>
 void GruCompute(const GruParam<CPU>& param) {
   auto* input = param.InputInput();
   auto* h0 = param.InputH0();
@@ -57,8 +55,6 @@ void GruCompute(const GruParam<CPU>& param) {
   bool is_reverse = param.IsReverse();
   math::LoDTensor2BatchFunctor<CPU, float> to_batch;
   to_batch(*input, batch_gate, true, is_reverse);
-  // math::ClearTensor<CPU, float> clearTensor;
-  // clearTensor(batch_gate);
   if (bias) {
     math::RowwiseAdd<CPU, float> add_bias;
     add_bias(*batch_gate, *bias, batch_gate);
@@ -68,7 +64,7 @@ void GruCompute(const GruParam<CPU>& param) {
   gru_value.gate_weight = const_cast<float*>(weight_data);
   gru_value.state_weight =
       const_cast<float*>(weight_data + 2 * frame_size * frame_size);
-  Tensor ordered_h0;
+  framework::Tensor ordered_h0;
   std::vector<size_t> order(batch_gate->lod()[2]);
   if (h0) {
     // Since the batch computing for GRU reorders the input sequences
@@ -87,9 +83,10 @@ void GruCompute(const GruParam<CPU>& param) {
     int bstart = static_cast<int>(batch_starts[n]);
     int bend = static_cast<int>(batch_starts[n + 1]);
     int cur_batch_size = bend - bstart;
-    Tensor gate_t = batch_gate->Slice(bstart, bend);  // BUG
-    Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
-    Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+    framework::Tensor gate_t = batch_gate->Slice(bstart, bend);
+    framework::Tensor reset_hidden_prev_t =
+        batch_reset_hidden_prev->Slice(bstart, bend);
+    framework::Tensor hidden_t = batch_hidden->Slice(bstart, bend);
     gru_value.output_value = hidden_t.data<float>();
     gru_value.gate_value = gate_t.data<float>();
     gru_value.reset_output_value = reset_hidden_prev_t.data<float>();
@@ -105,7 +102,6 @@ void GruCompute(const GruParam<CPU>& param) {
 }
 }  // namespace operators
 }  // namespace paddle_mobile
-#endif
+#endif  // GRU_OP
@@ -19,40 +19,6 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
-// 1. If x and y are both 2-D, e.g.
-//    x = [[1,2],   y = [[5,6],
-//         [3,4]]        [7,8]]
-//    the result is an ordinary matrix product:
-//    out = [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]]
-//
-// 2. If x or y has more than 2 dimensions, e.g. x has shape (2,3,4)
-//    and y has shape (4,1,2):
-//    x = [[[1,2,3,4],
-//          [2,3,4,5],
-//          [3,4,5,6]],
-//         [[1,2,3,4],
-//          [2,3,4,5],
-//          [3,4,5,6]]]
-//    y = [[[1,2]],
-//         [[3,4]],
-//         [[5,6]],
-//         [[7,8]]]
-//    then x_num_col_dims and y_num_col_dims are needed to flatten x and y
-//    to 2-D. The model provides x_num_col_dims = 2 and y_num_col_dims = 1,
-//    interpreted as half-open ranges [begin, end).
-//    (1) For x = (2,3,4): multiply the dims in [0, x_num_col_dims),
-//        2*3 = 6, and the dims in [x_num_col_dims, xdim.size()), 4,
-//        so the dims of tensor x are rewritten as (6,4).
-//    (2) For y = (4,1,2): multiply the dims in [0, y_num_col_dims), 4,
-//        and the dims in [y_num_col_dims, ydim.size()), 1*2 = 2,
-//        so the dims of tensor y are rewritten as (4,2).
-//    This does not change the layout of x and y in memory.
-//    x = [[1,2,3,4],                     y = [[1,2],
-//         [2,3,4,5],                          [3,4],
-//         [3,4,5,6],   (matrix multiply)      [5,6],
-//         [1,2,3,4],                          [7,8]]
-//         [2,3,4,5],
-//         [3,4,5,6]]
-//    The result: x (6 rows, 4 cols) times y (4 rows, 2 cols), multiplied
-//    as in case 1, gives out (6 rows, 2 cols).
 template <typename P>
 void MulCompute(const MulParam<CPU> &param) {
   const Tensor *input_x = param.InputX();
@@ -73,12 +39,12 @@ void MulCompute(const MulParam<CPU> &param) {
   }
   if (param.InputX()->type() == typeid(int8_t)) {
     out->mutable_data<int32_t>();
-    math::matmul<int8_t, int32_t>(x_matrix, false, y_matrix, false,
+    math::MatMul<int8_t, int32_t>(x_matrix, false, y_matrix, false,
                                   static_cast<float>(1), out,
                                   static_cast<float>(0));
   } else {
     out->mutable_data<float>();
-    math::matmul<float, float>(x_matrix, false, y_matrix, false,
+    math::MatMul<float, float>(x_matrix, false, y_matrix, false,
                                static_cast<float>(1), out,
                                static_cast<float>(0));
   }
......
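The removed comment above describes how `x_num_col_dims` and `y_num_col_dims` collapse a higher-rank tensor to a 2-D matrix before the `MatMul` call. A small sketch of that shape arithmetic, assuming the dims are available as a plain vector:

```cpp
#include <utility>
#include <vector>

// Collapse dims [0, num_col_dims) into rows and [num_col_dims, size())
// into columns, as described in the comment removed above. The memory
// layout of the tensor is untouched; only its 2-D view changes.
std::pair<int, int> FlattenTo2D(const std::vector<int> &dims,
                                int num_col_dims) {
  int rows = 1, cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (size_t i = num_col_dims; i < dims.size(); ++i) cols *= dims[i];
  return {rows, cols};
}

// Example from the comment: FlattenTo2D({2, 3, 4}, 2) -> (6, 4) and
// FlattenTo2D({4, 1, 2}, 1) -> (4, 2), so the product has shape (6, 2).
```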
@@ -94,27 +94,19 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context,
     memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
   }
-  // for (int i = 0; i < out->numel(); i++) {
-  //   DLOG << out_data[i];
-  // }
-  // bias_data has the same dimensions as out
-  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+  math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
                       out, static_cast<float>(1), false);
   out_image->InitEmptyImage(context, commandQueue, out->dims());
   framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1);
-  DLOG << *out;
   delete (input_x);
   delete (input_y);
   delete (input_z);
   delete (out);
   PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
-  // if (out_dim.size() != 2) {
-  //   out->Resize(out_dim);
-  // }
 }
 template <>
 void FusionFcKernel<GPU_CL, float>::Compute(
     const FusionFcParam<GPU_CL> &param) {
......
@@ -61,7 +61,7 @@ void FusionFcKernel<GPU_MALI, float>::Compute(
   for (int i = 0; i < out->numel(); i++) {
     DLOG << out_data[i];
   }
-  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+  math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
                       out, static_cast<float>(1));
   PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
   // if (out_dim.size() != 2) {
......
@@ -44,7 +44,7 @@ void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> &param) {
   if (out_dim.size() != 2) {
     out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
   }
-  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+  math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
                       out, static_cast<float>(0));
   if (out_dim.size() != 2) {
     out->Resize(out_dim);
......
@@ -38,7 +38,11 @@ limitations under the License. */
  *
  *  (this is the zlib license)
  */
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #pragma once
 #include <arm_neon.h>
 #define c_inv_mant_mask ~0x7f800000u
@@ -316,11 +320,11 @@ static inline float32x4_t cos_ps(float32x4_t x) {
 static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) {
   float32x4_t reciprocal = vrecpeq_f32(b);
   reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
-  // reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
   return vmulq_f32(a, reciprocal);
 }
 static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
+  // pow(x, m) = exp(m * log(x))
   return exp_ps(vmulq_f32(b, log_ps(a)));
 }
+#endif  // __ARM_NEON__
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/math/math_function.h"
-#include <cstring>
 #include <string>
+#include "common/enforce.h"
 #include "framework/data_type.h"
 #include "framework/tensor.h"
 #include "operators/math/gemm.h"
@@ -35,13 +35,13 @@ struct TensorSetConstant {
   float value_;
 };
-void set_constant(framework::Tensor *tensor, float value) {
+void SetConstant(framework::Tensor *tensor, float value) {
   framework::VisitDataType(framework::ToDataType(tensor->type()),
                            TensorSetConstant(tensor, value));
 }
 template <>
-void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
+void MatMul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
                           const framework::Tensor &matrix_b, bool trans_b,
                           float alpha, framework::Tensor *matrix_out,
                           float beta, bool relu, float *bias) {
@@ -50,7 +50,7 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
   auto dim_out = matrix_out->dims();
   PADDLE_MOBILE_ENFORCE(
       dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-      "The input and output of matmul be matrix");
+      "The input and output of MatMul be matrix");
   int M = dim_out[0];
   int N = dim_out[1];
@@ -72,7 +72,6 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
   }
 #ifdef _OPENMP
   gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
                  matrix_out->data<float>(), N, relu, bias);
 #else
@@ -92,19 +91,18 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
   }
 }
-template <>
-void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
-                         const framework::Tensor &matrix_b, bool trans_b,
-                         float alpha, framework::Tensor *matrix_out, float beta,
-                         bool relu, framework::Tensor *new_scale,
-                         framework::Tensor *new_bias, int group, float *bias) {
+void MatMulWithBn(const framework::Tensor &matrix_a, bool trans_a,
+                  const framework::Tensor &matrix_b, bool trans_b, float alpha,
+                  framework::Tensor *matrix_out, float beta, bool relu,
+                  framework::Tensor *new_scale, framework::Tensor *new_bias,
+                  int group, float *bias) {
   Gemm gemm;
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
   PADDLE_MOBILE_ENFORCE(
       dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-      "The input and output of matmul be matrix");
+      "The input and output of MatMul be matrix");
   int M = dim_out[0];
   int N = dim_out[1];
@@ -122,7 +120,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
                    new_bias->data<float>() + group, bias);
 #endif
 }
-void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
+void MatMulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
                      const framework::Tensor &matrix_b, bool trans_b,
                      framework::Tensor *matrix_out, float *p, std::string mode,
                      float *bias, float *bias1) {
@@ -132,7 +130,7 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
   auto dim_out = matrix_out->dims();
   PADDLE_MOBILE_ENFORCE(
       dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-      "The input and output of matmul be matrix");
+      "The input and output of MatMul be matrix");
   int M = dim_out[0];
   int N = dim_out[1];
@@ -146,7 +144,6 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
   gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K,
                       matrix_b.data<float>(), N, matrix_out->data<float>(), N,
                       p, mode, bias, bias1);
 #endif
 }
......
@@ -14,7 +14,6 @@ limitations under the License. */
 #pragma once
-#include <cmath>
 #include <string>
 #include "framework/tensor.h"
@@ -22,37 +21,37 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
-void set_constant(framework::Tensor *tensor, float value);
+void SetConstant(framework::Tensor *tensor, float value);
 template <typename Itype, typename Otype>
-void matmul(const framework::Tensor &matrix_a, bool trans_a,
+void MatMul(const framework::Tensor &matrix_a, bool trans_a,
             const framework::Tensor &matrix_b, bool trans_b, float alpha,
             framework::Tensor *matrix_out, float beta, bool relu = false,
             Otype *bias = nullptr);
 template <typename Itype, typename Otype>
-void matmul(const framework::Tensor &matrix_a, bool trans_a,
+void MatMul(const framework::Tensor &matrix_a, bool trans_a,
            const framework::Tensor &matrix_b, bool trans_b, float alpha,
            framework::Tensor *matrix_out, float beta, bool relu, Otype *bias,
            bool addOnRow);
-template <typename T>
-void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
+void MatMulWithBn(const framework::Tensor &matrix_a, bool trans_a,
                   const framework::Tensor &matrix_b, bool trans_b, float alpha,
                   framework::Tensor *matrix_out, float beta, bool relu,
                   framework::Tensor *new_scale, framework::Tensor *new_bias,
-                  int group, T *bias = nullptr);
+                  int group, float *bias = nullptr);
-void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
+void MatMulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
                      const framework::Tensor &matrix_b, bool trans_b,
                      framework::Tensor *matrix_out, float *p, std::string mode,
                      float *bias, float *bias1);
-template <typename DeviceType, typename T>
+template <typename Device, typename T>
 struct ClearTensor {
   void operator()(framework::Tensor *tensor);
 };
-template <typename DeviceType, typename T>
+template <typename Device, typename T>
 struct RowwiseAdd {
   void operator()(const framework::Tensor &input, const framework::Tensor &vec,
                   framework::Tensor *output);
......
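The declarations above follow the usual GEMM convention: `matrix_out = alpha * op(matrix_a) * op(matrix_b) + beta * matrix_out`, with an optional fused relu and bias. A minimal call sketch, assuming 2-D float tensors (the shapes here are only for illustration):

```cpp
#include "framework/ddim.h"
#include "framework/tensor.h"
#include "operators/math/math_function.h"

// Sketch: out = 1.0 * a * b + 0.0 * out for a 4x8 by 8x16 product,
// with no transposes, no fused relu, and no bias.
void MatMulExample() {
  paddle_mobile::framework::Tensor a, b, out;
  a.mutable_data<float>(paddle_mobile::framework::make_ddim({4, 8}));
  b.mutable_data<float>(paddle_mobile::framework::make_ddim({8, 16}));
  out.mutable_data<float>(paddle_mobile::framework::make_ddim({4, 16}));
  paddle_mobile::operators::math::MatMul<float, float>(
      a, false, b, false, 1.f, &out, 0.f, false,
      static_cast<float *>(nullptr));
}
```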
@@ -22,7 +22,7 @@ namespace operators {
 namespace math {
 template <>
-void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
+void MatMul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
                              const framework::Tensor &matrix_b, bool trans_b,
                              float alpha, framework::Tensor *matrix_out,
                              float beta, bool relu, int32_t *bias,
@@ -32,7 +32,7 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
   auto dim_out = matrix_out->dims();
   PADDLE_MOBILE_ENFORCE(
       dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-      "The input and output of matmul be matrix");
+      "The input and output of MatMul be matrix");
   int32_t M = dim_out[0];
   int32_t N = dim_out[1];
@@ -96,11 +96,11 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
 }
 template <>
-void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
+void MatMul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
                              const framework::Tensor &matrix_b, bool trans_b,
                              float alpha, framework::Tensor *matrix_out,
                              float beta, bool relu, int32_t *bias) {
-  matmul<int8_t, int32_t>(matrix_a, trans_a, matrix_b, trans_b, alpha,
+  MatMul<int8_t, int32_t>(matrix_a, trans_a, matrix_b, trans_b, alpha,
                           matrix_out, beta, relu, bias, false);
 }
......
@@ -15,154 +15,131 @@ limitations under the License. */
 #ifdef SOFTMAX_OP
 #include "operators/math/softmax.h"
-#include "common/types.h"
-#ifdef __ARM_NEON
 #include <math.h>
 #include <algorithm>
+#include <limits>
+#include "common/types.h"
 #include "operators/math/math_func_neon.h"
-#endif
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-using framework::DDim;
-using framework::Tensor;
-template <typename T>
-class SoftmaxFuntor<CPU, T> {
-#ifdef __ARM_NEON
-  void sum(float *input, float *sumptr, int inner_size, int outter_size) {
-    float32x4_t acc = vdupq_n_f32(0);
-    float sum_ = 0;
-    for (int i = 0; i < outter_size; ++i) {
-      float *input_outer_ptr = input + i * inner_size;
-      int nn = inner_size >> 2;
-      int left = inner_size - (nn << 2);
-      for (; nn > 0; nn--) {
-        float32x4_t vec_input = vld1q_f32(input_outer_ptr);
-        acc = vaddq_f32(acc, vec_input);
-        input_outer_ptr += 4;
-      }
-      float32x2_t vsum_ = vadd_f32(vget_high_f32(acc), vget_low_f32(acc));
-      sum_ = vget_lane_f32(vsum_, 0) + vget_lane_f32(vsum_, 1);
-      for (; left > 0; left--) {
-        sum_ += *input_outer_ptr;
-        input_outer_ptr++;
-      }
-    }
-    for (int j = 0; j < inner_size * outter_size; ++j) {
-      sumptr[j] = sum_;
-    }
-  }
-  void SoftmaxCacl(const Tensor *X, Tensor *Y) {
-    const float *input = X->data<float>();
-    const DDim &dDim = X->dims();
-    int axis_index = 1;
-    if (dDim.size() < 4) {
-      axis_index = 0;
-    }
-    DDim outer_ddim =
-        paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
-    DDim inner_ddim =
-        paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
-    int out_size = paddle_mobile::framework::product(outer_ddim);
-    int inner_size = paddle_mobile::framework::product(inner_ddim);
-    auto *max_ptr = new float[inner_size * out_size];
-    // max
-    for (int j = 0; j < out_size; ++j) {
-      const float *input_outer_ptr = input + j * inner_size;
-      float *max_outer_ptr = max_ptr + j * inner_size;
-      float max_ = 0;
-      for (int i = 0; i < inner_size; ++i) {
-        const float *input_inner_ptr = input_outer_ptr + i;
-        max_ = std::max(max_, input_inner_ptr[0]);
-      }
-      for (int k = 0; k < inner_size; ++k) {
-        max_outer_ptr[k] = max_;
-      }
-    }
-    // exp(value - max)
-    float *exp_sub_max = new float[inner_size * out_size];
-    float *exp_sub_max_ptr = &exp_sub_max[0];
-    for (int l = 0; l < out_size; ++l) {
-      const float *input_outer_ptr = input + l * inner_size;
-      float *max_outer_ptr = max_ptr + l * inner_size;
-      int nn = inner_size >> 2;
-      int left = inner_size - (nn << 2);
-      for (; nn > 0; nn--) {
-        float32x4_t vec_input = vld1q_f32(input_outer_ptr);
-        float32x4_t vec_max = vld1q_f32(max_outer_ptr);
-        float32x4_t vec_sub = vsubq_f32(vec_input, vec_max);
-        float32x4_t vec_exp = exp_ps(vec_sub);
-        vst1q_f32(exp_sub_max_ptr, vec_exp);
-        input_outer_ptr += 4;
-        max_outer_ptr += 4;
-        exp_sub_max_ptr += 4;
-      }
-      for (; left > 0; left--) {
-        *exp_sub_max_ptr = expf(*input_outer_ptr - *max_outer_ptr);
-        input_outer_ptr++;
-        max_outer_ptr++;
-        exp_sub_max_ptr++;
-      }
-    }
-    float *sumptr = new float[inner_size * out_size];
-    // sum exp
-    sum(exp_sub_max, sumptr, inner_size, out_size);
-    // div
-    auto *out_ptr = Y->mutable_data<float>();
-    for (int l = 0; l < out_size; ++l) {
-      const float *input_outer_ptr = exp_sub_max + l * inner_size;
-      float *output_outer_ptr = out_ptr + l * inner_size;
-      float *sum_outer_ptr = sumptr + l * inner_size;
-      int nn = inner_size >> 2;
-      int left = inner_size - (nn << 2);
-      for (; nn > 0; nn--) {
-        float32x4_t vec_input = vld1q_f32(input_outer_ptr);
-        float32x4_t vec_sum = vld1q_f32(sum_outer_ptr);
-        float32x4_t vec_div = div_ps(vec_input, vec_sum);
-        vst1q_f32(output_outer_ptr, vec_div);
-        input_outer_ptr += 4;
-        output_outer_ptr += 4;
-        sum_outer_ptr += 4;
-      }
-      for (; left > 0; left--) {
-        *output_outer_ptr = (*input_outer_ptr) / (*sum_outer_ptr);
-        input_outer_ptr++;
-        output_outer_ptr++;
-        sum_outer_ptr++;
-      }
-    }
-  }
-#else
-#endif  // ARM_NEON
- public:
-  void operator()(const framework::Tensor *X, framework::Tensor *Y) {
-    const DDim dDim = X->dims();
-    int dim1 = dDim[dDim.size() - 1];
-    int dim0 = X->numel() / dim1 / dDim[0];
-    framework::DDim matrix_shape = {dim0, dim1};
-    for (int i = 0; i < dDim[0]; ++i) {
-      framework::Tensor sub_X = X->Slice(i, i + 1);
-      framework::Tensor sub_Y = Y->Slice(i, i + 1);
-      sub_X.Resize(matrix_shape);
-      sub_Y.Resize(matrix_shape);
-      for (int j = 0; j < dim0; j++) {
-        framework::Tensor sub_x = sub_X.Slice(j, j + 1);
-        framework::Tensor sub_y = sub_Y.Slice(j, j + 1);
-#ifdef __ARM_NEON
-        SoftmaxCacl(&sub_x, &sub_y);
-#endif
-      }
-    }
-  }
-};
-template class SoftmaxFuntor<CPU, float>;
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#ifndef __aarch64__
+inline float32_t vmaxvq_f32(const float32x4_t &r) {
+  float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
+  return vget_lane_f32(vpmax_f32(v, v), 0);
+}
+
+inline float32_t vaddvq_f32(const float32x4_t &r) {
+  float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r));
+  return vget_lane_f32(vpadd_f32(v, v), 0);
+}
+#endif  // __aarch64__
+#endif  // __ARM_NEON__
+
+float find_max(const float *input, const int num_classes) {
+  int remain = num_classes;
+  float max = -std::numeric_limits<float>::max();
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+  int loop = num_classes >> 3;
+  remain = num_classes & 0x7;
+  float32x4_t __max = vdupq_n_f32(max);
+  for (int i = 0; i < loop; ++i, input += 8) {
+    float32x4_t x0 = vld1q_f32(input);
+    float32x4_t x1 = vld1q_f32(input + 4);
+    __max = vmaxq_f32(x0, __max);
+    __max = vmaxq_f32(x1, __max);
+  }
+  max = vmaxvq_f32(__max);
+#endif
+  for (int i = 0; i < remain; ++i) {
+    max = std::max(max, input[i]);
+  }
+  return max;
+}
+
+template <>
+void SoftmaxFuntor<CPU, float>::operator()(const framework::Tensor *X,
+                                           framework::Tensor *Y) {
+  const framework::DDim &dims = X->dims();
+  int batch_size = dims[0];
+  int num_classes = dims[dims.size() - 1];
+  int channels = X->numel() / batch_size / num_classes;
+  const float *x = X->data<float>();
+  float *y = Y->mutable_data<float>();
+
+#pragma omp parallel for collapse(2)
+  for (int batch = 0; batch < X->dims()[0]; ++batch) {
+    for (int channel = 0; channel < channels; ++channel) {
+      size_t offset = (batch * channels + channel) * num_classes;
+      const float *input = x + offset;
+      float *output = y + offset;
+      // find max
+      float max = find_max(input, num_classes);
+      // exp(x - max)
+      int remain = num_classes;
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+      int loop = num_classes >> 3;
+      remain = num_classes & 0x7;
+      float32x4_t __max = vdupq_n_f32(max);
+      for (int i = 0; i < loop; ++i, input += 8, output += 8) {
+        float32x4_t x0 = vld1q_f32(input);
+        float32x4_t x1 = vld1q_f32(input + 4);
+        x0 = vsubq_f32(x0, __max);
+        x1 = vsubq_f32(x1, __max);
+        x0 = exp_ps(x0);
+        x1 = exp_ps(x1);
+        vst1q_f32(output, x0);
+        vst1q_f32(output + 4, x1);
+      }
+#endif  // __ARM_NEON__
+      for (int i = 0; i < remain; ++i) {
+        output[i] = expf(input[i] - max);
+      }
+      // sum(exp(x - max))
+      float sum = 0.f;
+      output = y + offset;
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+      float32x4_t __sum = vdupq_n_f32(0.f);
+      for (int i = 0; i < loop; ++i, output += 8) {
+        float32x4_t x0 = vld1q_f32(output);
+        float32x4_t x1 = vld1q_f32(output + 4);
+        __sum = vaddq_f32(x0, __sum);
+        __sum = vaddq_f32(x1, __sum);
+      }
+      sum += vaddvq_f32(__sum);
+#endif  // __ARM_NEON__
+      for (int i = 0; i < remain; ++i) {
+        sum += output[i];
+      }
+      // exp(x - max) / sum
+      float inv_sum = 1.f / sum;
+      output = y + offset;
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+      float32x4_t __inv_sum = vdupq_n_f32(inv_sum);
+      for (int i = 0; i < loop; ++i, output += 8) {
+        float32x4_t x0 = vld1q_f32(output);
+        float32x4_t x1 = vld1q_f32(output + 4);
+        x0 = vmulq_f32(x0, __inv_sum);
+        x1 = vmulq_f32(x1, __inv_sum);
+        vst1q_f32(output, x0);
+        vst1q_f32(output + 4, x1);
+      }
+#endif
+      for (int i = 0; i < remain; ++i) {
+        output[i] *= inv_sum;
+      }
+    }
+  }
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
-#endif
+#endif  // SOFTMAX_OP
@@ -13,17 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef SOFTMAX_OP
 #pragma once
 #include "framework/tensor.h"
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-template <typename DeviceType, typename T>
+template <typename Device, typename T>
 class SoftmaxFuntor {
  public:
   void operator()(const framework::Tensor *X, framework::Tensor *Y);
 };
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
......
@@ -261,20 +261,17 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp)
     target_link_libraries(test-inference-api paddle-mobile)
-    # gen test log
     # gen test
     ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp)
     target_link_libraries(test-optimize paddle-mobile)
     #gen test
     ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-pool-op paddle-mobile)
     #gen test
-    ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-softmax paddle-mobile)
+    ADD_EXECUTABLE(test-softmax-op operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-softmax-op paddle-mobile)
     # gen test
     ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
......
@@ -73,14 +73,14 @@ int main() {
   // float
   // warm-up 10 times
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<float, float>(
+    paddle_mobile::operators::math::MatMul<float, float>(
         aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
         false, nullptr);
   }
   auto time_start0 = time();
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<float, float>(
+    paddle_mobile::operators::math::MatMul<float, float>(
         aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
         false, nullptr);
   }
@@ -91,14 +91,14 @@ int main() {
   // int8_t without bias
   // warm-up 10 times
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<int8_t, int32_t>(
+    paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
         aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
         static_cast<float>(0));
   }
   auto time_start1 = time();
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<int8_t, int32_t>(
+    paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
         aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
         static_cast<float>(0));
   }
@@ -109,13 +109,13 @@ int main() {
   // int8_t with bias, column element wise add
   // warm-up 10 times
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<int8_t, int32_t>(
+    paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
        static_cast<float>(0), false, bias_data_col, false);
   }
   auto time_start2 = time();
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<int8_t, int32_t>(
+    paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
        static_cast<float>(0), false, bias_data_col, false);
   }
@@ -126,13 +126,13 @@ int main() {
   // int8_t with bias, row element wise add
   // warm-up 10 times
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<int8_t, int32_t>(
+    paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
        static_cast<float>(0), false, bias_data_row, true);
   }
   auto time_start3 = time();
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<int8_t, int32_t>(
+    paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
        static_cast<float>(0), false, bias_data_row, true);
   }
@@ -143,13 +143,13 @@ int main() {
   // int8_t with bias&relu
   // warm-up 10 times
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<int8_t, int32_t>(
+    paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
        static_cast<float>(0), true, bias_data_col, false);
   }
   auto time_start4 = time();
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<int8_t, int32_t>(
+    paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
        static_cast<float>(0), true, bias_data_col, false);
   }
......
@@ -12,29 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <math.h>
+#include <limits>
 #include "../test_include.h"
 #include "operators/softmax_op.h"
-int main() {
-  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string(g_mobilenet));
-  if (program.originProgram == nullptr) {
-    DLOG << "program read file";
-  }
-  Executor4Test<paddle_mobile::CPU,
-                paddle_mobile::operators::SoftmaxOp<paddle_mobile::CPU, float>>
-      executor(program, "softmax");
-  paddle_mobile::framework::Tensor input;
-  SetupTensor<float>(&input, {1, 1000}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto out_ddim = paddle_mobile::framework::make_ddim({1, 1000});
-  auto output =
-      executor.Predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim);
-  auto *output_ptr = output->data<float>();
-  for (int j = 0; j < output->numel(); ++j) {
-    DLOG << " value of output: " << output_ptr[j];
-  }
+namespace paddle_mobile {
+
+void Softmax(const framework::Tensor *X, framework::Tensor *Y) {
+  const framework::DDim &dims = X->dims();
+  int batch_size = dims[0];
+  int num_classes = dims[dims.size() - 1];
+  int channels = X->numel() / batch_size / num_classes;
+  const float *x = X->data<float>();
+  float *y = Y->mutable_data<float>();
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int c = 0; c < channels; ++c) {
+      size_t offset = (batch * channels + c) * num_classes;
+      const float *input = x + offset;
+      float *output = y + offset;
+      float max = -std::numeric_limits<float>::max();
+      for (int j = 0; j < num_classes; ++j) {
+        max = (input[j] > max) ? input[j] : max;
+      }
+      float sum = 0.f;
+      for (int j = 0; j < num_classes; ++j) {
+        float tmp = expf(input[j] - max);
+        sum += tmp;
+        output[j] = tmp;
+      }
+      for (int j = 0; j < num_classes; ++j) {
+        output[j] /= sum;
+      }
+    }
+  }
+}
+
+int TestSoftmaxOp(const std::vector<int> input_shape) {
+  framework::DDim dims = framework::make_ddim(input_shape);
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["X"] = std::vector<std::string>({"input"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+
+  auto input_var = scope.get()->Var("input");
+  auto input = input_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(input, dims, -100.0, 100.0);
+
+  auto output_var = scope.get()->Var("output");
+  auto output = output_var->template Get<framework::LoDTensor>();
+
+  framework::AttributeMap attrs;
+  auto *op = new operators::SoftmaxOp<CPU, float>("softmax", inputs, outputs,
+                                                  attrs, scope);
+  op->InferShape();
+  op->Init();
+  op->Run();
+
+  framework::Tensor output_cmp;
+  float *output_cmp_data = output_cmp.mutable_data<float>(output->dims());
+  Softmax(input, &output_cmp);
+
+  const float *output_data = output->data<float>();
+  for (int i = 0; i < output->numel(); ++i) {
+    float gap = output_data[i] - output_cmp_data[i];
+    if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
+      LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
+                     << ", output_cmp_data[" << i
+                     << "] = " << output_cmp_data[i];
+      delete op;
+      exit(1);
+    }
+  }
+  delete op;
+  return 0;
+}
+}  // namespace paddle_mobile
+
+int main(int argc, char *argv[]) {
+  paddle_mobile::TestSoftmaxOp({128, 1000});
+  paddle_mobile::TestSoftmaxOp({128, 10, 1000});
   return 0;
 }
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
     grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
-    grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "paddle_mobile_jni.cpp"); do
+    grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
     cpplint $file;
     TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done
......