Commit f9616068 authored by Ray Liu, committed by GitHub

Merge pull request #1352 from hjchen2/ocr_ctc

Refactor predict api, optimize softmax, add top_k and cast operators
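The refactor replaces the old pattern of "auto output = paddle_mobile.Predict(input)" with an explicit run-then-fetch sequence, as the call-site changes in the iOS, JNI and api wrapper diffs below show. A minimal usage sketch of the new interface, assuming a CPU build; the model directory path, thread count and input shape are placeholders:

#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> engine;
  engine.SetThreadNum(4);  // arbitrary thread count for this sketch
  if (engine.Load("/path/to/model_dir", /*optimize=*/true) !=
      paddle_mobile::PMSuccess) {
    return -1;
  }
  paddle_mobile::framework::Tensor input;
  input.Resize(paddle_mobile::framework::make_ddim({1, 3, 224, 224}));
  input.mutable_data<float>();   // fill with real image data in practice
  engine.Predict(input);         // runs the network; returns a PMStatus
  auto output = engine.Fetch();  // shared_ptr to the "fetch" LoDTensor
  return output == nullptr ? -1 : 0;
}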
@@ -22,6 +22,8 @@ const char *G_OP_TYPE_BATCHNORM = "batch_norm";
 const char *G_OP_TYPE_BOX_CODER = "box_coder";
 const char *G_OP_TYPE_CONCAT = "concat";
 const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_ELEMENTWISE_SUB = "elementwise_sub";
+const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul";
 const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
 const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
 const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
@@ -67,8 +69,9 @@ const char *G_OP_TYPE_CRF = "crf_decoding";
 const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
 const char *G_OP_TYPE_FLATTEN = "flatten";
 const char *G_OP_TYPE_SHAPE = "shape";
-const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul";
 const char *G_OP_TYPE_SUM = "sum";
+const char *G_OP_TYPE_TOP_K = "top_k";
+const char *G_OP_TYPE_CAST = "cast";
 
 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
@@ -100,6 +103,8 @@ std::unordered_map<
         {G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}},
         {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
         {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_ELEMENTWISE_SUB, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
         {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
         {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
         {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
@@ -142,7 +147,8 @@ std::unordered_map<
         {G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}},
         {G_OP_TYPE_SUM, {{"X"}, {"Out"}}},
-        {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_TOP_K, {{"X"}, {"Out", "Indices"}}},
+        {G_OP_TYPE_CAST, {{"X"}, {"Out"}}},
         {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
         {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
         {G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}},
...
@@ -112,6 +112,8 @@ extern const char *G_OP_TYPE_BATCHNORM;
 extern const char *G_OP_TYPE_BOX_CODER;
 extern const char *G_OP_TYPE_CONCAT;
 extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
+extern const char *G_OP_TYPE_ELEMENTWISE_SUB;
+extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
@@ -149,7 +151,8 @@ extern const char *G_OP_TYPE_FUSION_CONV_BN;
 extern const char *G_OP_TYPE_CONV_TRANSPOSE;
 extern const char *G_OP_TYPE_PRELU;
 extern const char *G_OP_TYPE_SUM;
-extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
+extern const char *G_OP_TYPE_TOP_K;
+extern const char *G_OP_TYPE_CAST;
 
 extern const char *G_OP_TYPE_QUANTIZE;
 extern const char *G_OP_TYPE_DEQUANTIZE;
...
@@ -28,6 +28,10 @@ extern _PaddleMobile__Framework__Proto__VarType__Type ToDataType(
 extern std::type_index ToTypeIndex(
     _PaddleMobile__Framework__Proto__VarType__Type type);
 
+inline _PaddleMobile__Framework__Proto__VarType__Type ToDataType(int type) {
+  return static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(type);
+}
+
 template <typename Visitor>
 inline void VisitDataType(_PaddleMobile__Framework__Proto__VarType__Type type,
                           Visitor visitor) {
...
This diff is collapsed.
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "common/types.h"
 #include "common/util.h"
@@ -28,41 +29,29 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {
 
-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class Executor {
  public:
-  typedef typename PrecisionTrait<P>::ptype Ptype;
-  // exector constructor
-  // @param program program converted from proto program in PaddlePaddle
-  // @param use_optimize bool whether use operator fusion to speed up or not
-  // @param loddable bool
-  Executor(const framework::Program<Dtype> program, int batch_size = 1,
-           const bool use_optimize = true, const bool loddable = false);
-
-  // predict with tensor input
-  // @param t input tensor to do prediction
-  // @return predicted tensor
-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
-
-  // predict with lod tensor input
-  // @param t input lod tensor to do prediction
-  // @return predicted lod tensor
-  std::shared_ptr<framework::LoDTensor> PredictLod(
-      const framework::LoDTensor &t);
-
-  // predict with vector input and dims
-  // @param input vector whose elements will be formed
-  // @param input lod tensor to do prediction
-  // @param dims vector whose elements will be formed
-  // @param input tensor shape
-  // @return vector which is flatted from predicted tensor
-  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
-                             const std::vector<int64_t> &dims);
+  Executor(const Program<Device> &program, int batch_size = 1,
+           const bool use_optimize = true, const bool lod_mode = false);
+
+  PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, LoDTensor>> &inputs);
+
+  std::vector<T> Predict(const std::vector<T> &input,
+                         const std::vector<int64_t> &dims);
+  PMStatus Predict();
+
+  void SetInput(const Tensor &input, const std::string &var_name);
+  void SetInput(const LoDTensor &input, const std::string &var_name);
+
+  std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);
 
 #ifdef PADDLE_MOBILE_FPGA
-  void InjectVariable(const framework::Tensor &t, std::string var_name);
-  void FeedData(const framework::Tensor &t);
-  std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
+  void InjectVariable(const Tensor &t, std::string var_name);
+  void FeedData(const Tensor &t);
+  std::shared_ptr<Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
   void Predict_To(int end);
@@ -70,26 +59,28 @@ class Executor {
  protected:
   Executor() = default;
-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
-                                             int block_id);
-  bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc,
-                      framework::Variable *var,
-                      framework::LoDTensor *tensor) const;
+
+  bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc, Variable *var,
+                      LoDTensor *tensor) const;
   void InitMemory();
   void InitCombineMemory();
-  void LoadMemory(void **data,
-                  const std::shared_ptr<framework::VarDesc> var_desc,
-                  framework::LoDTensor *tensor);
+  void LoadMemory(void **data, const std::shared_ptr<VarDesc> var_desc,
+                  LoDTensor *tensor);
 #ifdef PADDLE_MOBILE_CL
-  void LoadMemory(const framework::VarDesc var_desc, float *tensorInput,
-                  char **data);
+  void LoadMemory(const VarDesc var_desc, float *tensorInput, char **data);
 #endif
-  framework::Program<Dtype> program_;
-  int batch_size_ = 1;
-  std::shared_ptr<framework::ProgramDesc> to_predict_program_;
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
-      ops_of_block_;
+
+  int batch_size_;
+  bool use_optimize_;
+  bool lod_mode_;
+  Program<Device> program_;
+  std::shared_ptr<ProgramDesc> program_desc_;
+  typedef std::shared_ptr<OperatorBase<Device>> OperatorBasePtr;
+  std::vector<std::vector<OperatorBasePtr>> ops_of_block_;
+  // operators list
+  std::vector<OperatorBasePtr> ops_list_;
+
 #ifdef PADDLE_MOBILE_PROFILE
   struct ProfInfo {
     int tid = 0;
@@ -97,8 +88,6 @@ class Executor {
     uint64_t runEnd = 0UL;
   };
 #endif
-  bool use_optimize_ = false;
-  bool loddable_ = false;
 };
 
 }  // namespace framework
...
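With the interface above, running a model through the Executor directly separates input binding, execution, and output retrieval. A rough sketch of that flow, assuming a CPU build; the loader arguments, the "feed"/"fetch" variable names and the input shape follow conventions used elsewhere in this PR and are placeholders here:

#include "framework/executor.h"
#include "framework/loader.h"

void RunOnce() {
  paddle_mobile::framework::Loader<paddle_mobile::CPU, float> loader;
  auto program = loader.Load("/path/to/model_dir", /*optimize=*/true);

  paddle_mobile::framework::Executor<paddle_mobile::CPU, float> executor(
      program, /*batch_size=*/1, /*use_optimize=*/true, /*lod_mode=*/false);

  paddle_mobile::framework::LoDTensor input;
  input.Resize(paddle_mobile::framework::make_ddim({1, 3, 64, 64}));
  input.mutable_data<float>();             // fill with real data in practice

  executor.SetInput(input, "feed");        // bind the input variable
  executor.Predict();                      // run every op in the block
  auto out = executor.GetOutput("fetch");  // shared_ptr<LoDTensor>
}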
@@ -228,6 +228,12 @@ LOAD_FUSION_MATCHER(fusion_conv_bn);
 #ifdef ELEMENTWISESUB_OP
 LOAD_OP1(elementwise_sub, CPU)
 #endif
+#ifdef TOP_K_OP
+LOAD_OP1(top_k, CPU)
+#endif
+#ifdef CAST_OP
+LOAD_OP1(cast, CPU)
+#endif
 #ifdef QUANT_OP
 LOAD_OP1(quantize, CPU);
 #endif
...
@@ -23,14 +23,8 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {
 
-/**
- * muteandresize tensor as originProgramDesc and scope in loadParams
- *
- * @param originProgramDesc
- * @param scope
- */
-template <typename Dtype, Precision P>
-void Loader<Dtype, P>::InitMemoryFromProgram(
+template <typename Device, typename T>
+void Loader<Device, T>::InitMemoryFromProgram(
     const std::shared_ptr<ProgramDesc> &originProgramDesc,
     const std::shared_ptr<Scope> &scope) {
   for (const auto &block : originProgramDesc.get()->Blocks()) {
@@ -43,8 +37,6 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
           tensor->Resize(make_ddim(dim));
         } else {
           auto dim = var_desc->Tensor_desc().Dims();
-          // PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
-          // dim[0] = 1;
           if (dim.size() == 0) {
             auto tensor = var->GetMutable<LoDTensor>();
             framework::DDim dDim = {0};
@@ -60,7 +52,7 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
           }
         }
       } else {
-        // TODO(codeWorm): some.
+        // TODO(codeWorm)
       }
     }
   }
@@ -68,7 +60,7 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
 
 #ifdef PADDLE_MOBILE_CL
 template <>
-void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
+void Loader<GPU_CL, float>::InitMemoryFromProgram(
     const std::shared_ptr<ProgramDesc> &originProgramDesc,
     const std::shared_ptr<Scope> &scope) {
   for (const auto &block : originProgramDesc.get()->Blocks()) {
@@ -77,7 +69,6 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
       if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
         if (var_desc->Persistable()) {
           auto dim = var_desc->Tensor_desc().Dims();
-          // auto tensor = var->GetMutable<LoDTensor>();
           auto cl_image = var->GetMutable<framework::CLImage>();
           cl_image->Resize(make_ddim(dim));
         } else {
@@ -88,14 +79,13 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
           cl_image->Resize(make_ddim(dim));
         }
       } else {
-        // TODO(codeWorm): some.
+        // TODO(codeWorm)
       }
     }
   }
 }
 template <>
-const Program<GPU_CL, Precision::FP32>
-Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
+const Program<GPU_CL, float> Loader<GPU_CL, float>::LoadCombinedMemory(
     size_t read_size, const uint8_t *buf, size_t combined_params_len,
     uint8_t *combined_params_buf, bool optimize, bool quantification) {
   bool can_add_split = false;
@@ -113,7 +103,7 @@ Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
 
-  Program<GPU_CL, Precision::FP32> program;
+  Program<GPU_CL, float> program;
   program.combined = true;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
@@ -145,16 +135,16 @@ Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
 /**
  * fusion and print someinfos
- * @tparam Dtype
+ * @tparam Device
  * @tparam P
  * @param optimize
  * @param can_add_split
  * @param program
  * @param originProgramDesc
  */
-template <typename Dtype, Precision P>
+template <typename Device, typename T>
 void FusionAndPrintInfos(
-    bool optimize, bool can_add_split, Program<Dtype, P> *program,
+    bool optimize, bool can_add_split, Program<Device, T> *program,
     const std::shared_ptr<ProgramDesc> &originProgramDesc) {
   if (optimize) {
     ProgramOptimize program_optimize;
@@ -193,22 +183,22 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
   return cur_len;
 }
 
-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &dirname,
-                                               bool optimize,
-                                               bool quantification,
-                                               bool can_add_split) {
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::Load(const std::string &dirname,
+                                                 bool optimize,
+                                                 bool quantification,
+                                                 bool can_add_split) {
   auto program = this->LoadProgram(dirname + "/__model__", optimize,
                                    quantification, can_add_split);
   program.model_path = dirname;
   return program;
 }
 
-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &model_path,
-                                               const std::string &para_path,
-                                               bool optimize,
-                                               bool quantification) {
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::Load(const std::string &model_path,
+                                                 const std::string &para_path,
+                                                 bool optimize,
+                                                 bool quantification) {
   auto program = this->LoadProgram(model_path, optimize, quantification);
   program.para_path = para_path;
@@ -217,8 +207,8 @@ const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &model_path,
   return program;
 }
 
-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::LoadProgram(
     const std::string &model_path, bool optimize, bool quantification,
     bool can_add_split) {
   std::string model_filename = model_path;
@@ -237,7 +227,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
   //
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
 
-  Program<Dtype, P> program;
+  Program<Device, T> program;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
   program.combined_params_len = 0;
@@ -254,8 +244,8 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
   return program;
 }
 
-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::LoadCombinedMemory(
     size_t read_size, const uint8_t *buf, size_t combined_params_len,
     uint8_t *combined_params_buf, bool optimize, bool quantification) {
   bool can_add_split = false;
@@ -273,7 +263,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
 
-  Program<Dtype, P> program;
+  Program<Device, T> program;
   program.combined = true;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
@@ -289,13 +279,13 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
   return program;
 }
 
-template class Loader<CPU, Precision::FP32>;
-template class Loader<FPGA, Precision::FP32>;
-template class Loader<GPU_MALI, Precision::FP32>;
-template class Loader<GPU_CL, Precision::FP32>;
+template class Loader<CPU, float>;
+template class Loader<FPGA, float>;
+template class Loader<GPU_MALI, float>;
+template class Loader<GPU_CL, float>;
 
 }  // namespace framework
 }  // namespace paddle_mobile
@@ -22,39 +22,39 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {
 
-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device = CPU, typename T = float>
 class Loader {
  public:
   /*
    * @b load separate format fluid model
-   * @b 加载分开形式的 fluid 模型
+   * @b 加载分开存储的fluid模型
    * */
-  const Program<Dtype, P> Load(const std::string &dirname,
-                               bool optimize = false,
-                               bool quantification = false,
-                               bool can_add_split = false);
+  const Program<Device, T> Load(const std::string &dirname,
+                                bool optimize = false,
+                                bool quantification = false,
+                                bool can_add_split = false);
 
   /*
    * @b load combine format fluid mode
-   * @b 加载结合在一起格式的模型
+   * @b 加载统一存储的fluid模型
    * */
-  const Program<Dtype, P> Load(const std::string &model_path,
-                               const std::string &para_path,
-                               bool optimize = false,
-                               bool quantification = false);
+  const Program<Device, T> Load(const std::string &model_path,
+                                const std::string &para_path,
+                                bool optimize = false,
+                                bool quantification = false);
 
-  const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
-                                             const uint8_t *model_buf,
-                                             size_t combined_params_len,
-                                             uint8_t *combined_params_buf,
-                                             bool optimize = false,
-                                             bool quantification = false);
+  const Program<Device, T> LoadCombinedMemory(size_t model_len,
+                                              const uint8_t *model_buf,
+                                              size_t combined_params_len,
+                                              uint8_t *combined_params_buf,
+                                              bool optimize = false,
+                                              bool quantification = false);
 
  private:
-  const Program<Dtype, P> LoadProgram(const std::string &model_path,
-                                      bool optimize = false,
-                                      bool quantification = false,
-                                      bool can_add_split = false);
+  const Program<Device, T> LoadProgram(const std::string &model_path,
+                                       bool optimize = false,
+                                       bool quantification = false,
+                                       bool can_add_split = false);
 
   void InitMemoryFromProgram(
       const std::shared_ptr<ProgramDesc> &originProgramDesc,
...
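As the comments above note, the two Load overloads correspond to the two fluid model layouts: a directory containing a __model__ file (with parameters typically stored as separate files), and a combined format with a single model file plus a single parameter file. A small sketch, with placeholder paths:

#include "framework/loader.h"

void LoadBothFormats() {
  paddle_mobile::framework::Loader<paddle_mobile::CPU, float> loader;

  // Separate format: dirname holds __model__ and the parameter files.
  auto separate = loader.Load("/path/to/model_dir", /*optimize=*/true);

  // Combined format: one model file plus one params file.
  auto combined = loader.Load("/path/to/model", "/path/to/params",
                              /*optimize=*/true, /*quantification=*/false);
}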
@@ -16,12 +16,12 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
-#include "tensor.h"
-#include "tensor_util.h"
+#include "framework/tensor.h"
+#include "framework/tensor_util.h"
 
 namespace paddle_mobile {
 namespace framework {
 
 /*
@@ -202,5 +202,29 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor);
 
 void DeserializeFromStream(std::istream &is, LoDTensor *tensor);
 
+#ifdef PADDLE_MOBILE_DEBUG
+inline Print &operator<<(Print &printer, const LoDTensor &tensor) {
+  printer << " dims: " << tensor.dims() << "\n";
+  int stride = tensor.numel() / 20;
+  stride = stride > 0 ? stride : 1;
+#ifndef PADDLE_MOBILE_FPGA
+  for (int i = 0; i < tensor.numel(); i += stride) {
+    if (tensor.type() == typeid(float)) {
+      printer << tensor.data<float>()[i] << " ";
+    } else if (tensor.type() == typeid(int32_t)) {
+      printer << tensor.data<int32_t>()[i] << " ";
+    } else if (tensor.type() == typeid(int64_t)) {
+      printer << tensor.data<int64_t>()[i] << " ";
+    } else if (tensor.type() == typeid(int8_t)) {
+      printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
+    } else if (tensor.type() == typeid(int32_t)) {
+      printer << tensor.data<int32_t>()[i] << " ";
+    }
+  }
+#endif  // PADDLE_MOBILE_FPGA
+  return printer;
+}
+#endif  // PADDLE_MOBILE_DEBUG
+
 }  // namespace framework
 }  // namespace paddle_mobile
@@ -14,16 +14,15 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "common/types.h"
 #include "framework/program/program_desc.h"
 #include "framework/scope.h"
-#include <string>
 
 namespace paddle_mobile {
 namespace framework {
 
-template <typename Dtype, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class Program {
  public:
   std::shared_ptr<ProgramDesc> originProgram;
...
@@ -26,6 +26,7 @@ limitations under the License. */
 
 namespace paddle_mobile {
 namespace framework {
+
 class Scope {
  public:
   Scope() = default;
...
@@ -148,8 +148,8 @@ class Tensor : public TensorBase {
     PADDLE_MOBILE_ENFORCE(
         (std::is_same<T, void>::value ||
          holder_->type().hash_code() == typeid(T).hash_code()),
-        "Tensor holds the wrong type, it holds %s",
-        this->holder_->type().name());
+        "Tensor holds the wrong type, it holds %s, requested %s",
+        this->holder_->type().name(), typeid(T).name());
 
     return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
@@ -162,7 +162,7 @@ class Tensor : public TensorBase {
     PADDLE_MOBILE_ENFORCE(
         (std::is_same<T, void>::value ||
          holder_->type().hash_code() == typeid(T).hash_code()),
-        "Tensor holds the wrong type, it holds %s ,requested:%s",
+        "Tensor holds the wrong type, it holds %s, requested %s",
         this->holder_->type().name(), typeid(T).name());
 
     return reinterpret_cast<const T *>(
@@ -226,7 +226,6 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
     }
   }
 #endif
-
   return printer;
 }
...
@@ -18,17 +18,17 @@
 namespace paddle_mobile {
 
-template <typename Dtype, Precision P>
-PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
+template <typename Device, typename T>
+PaddleMobilePredictor<Device, T>::PaddleMobilePredictor(
     const PaddleMobileConfig &config) {
   PADDLE_MOBILE_ENFORCE(Init(config) == true,
                         "paddle mobile predictor init failed!");
   config_ = config;
 }
 
-template <typename Dtype, Precision P>
-bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
-  paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
+template <typename Device, typename T>
+bool PaddleMobilePredictor<Device, T>::Init(const PaddleMobileConfig &config) {
+  paddle_mobile_.reset(new PaddleMobile<Device, T>());
 #ifdef PADDLE_MOBILE_CL
   paddle_mobile_->SetCLPath(config.cl_path);
 #endif
@@ -52,8 +52,8 @@ bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
   paddle_mobile_->SetThreadNum(config.thread_num);
   return true;
 }
-template <typename Dtype, Precision P>
-bool PaddleMobilePredictor<Dtype, P>::Run(
+template <typename Device, typename T>
+bool PaddleMobilePredictor<Device, T>::Run(
     const std::vector<PaddleTensor> &inputs,
     std::vector<PaddleTensor> *output_data, int batch_size) {
   if (inputs.empty()) {
@@ -78,12 +78,12 @@ bool PaddleMobilePredictor<Dtype, P>::Run(
   framework::Tensor input_tensor;
   input_tensor.Resize(ddim);
   int input_length = framework::product(ddim);
-  typedef typename PrecisionTrait<P>::ptype PType;
-  auto input_ptr = input_tensor.mutable_data<PType>();
+  auto input_ptr = input_tensor.mutable_data<T>();
 
-  memcpy(input_ptr, static_cast<PType *>(input.data.data()),
-         input_length * sizeof(PType));
-  auto output_tensor = paddle_mobile_->Predict(input_tensor);
+  memcpy(input_ptr, static_cast<T *>(input.data.data()),
+         input_length * sizeof(T));
+  paddle_mobile_->Predict(input_tensor);
+  auto output_tensor = paddle_mobile_->Fetch();
 
   if (output_data->empty()) {
     LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
@@ -99,18 +99,18 @@ bool PaddleMobilePredictor<Dtype, P>::Run(
     output.shape.push_back(static_cast<int>(d));
   }
 
-  if (output.data.length() < output_length * sizeof(PType)) {
-    output.data.Resize(output_length * sizeof(PType));
+  if (output.data.length() < output_length * sizeof(T)) {
+    output.data.Resize(output_length * sizeof(T));
   }
 
-  memcpy(output.data.data(), output_tensor->template data<PType>(),
-         output_length * sizeof(PType));
+  memcpy(output.data.data(), output_tensor->template data<T>(),
+         output_length * sizeof(T));
 
   return true;
 }
 
-template <typename Dtype, Precision P>
-PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() {
+template <typename Device, typename T>
+PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
   paddle_mobile_->Clear();
 }
 
@@ -122,13 +122,13 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
   std::unique_ptr<PaddlePredictor> x;
   if (config.precision == PaddleMobileConfig::FP32) {
     if (config.device == PaddleMobileConfig::kCPU) {
-      x.reset(new PaddleMobilePredictor<CPU, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<CPU, float>(config));
     } else if (config.device == PaddleMobileConfig::kFPGA) {
-      x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<FPGA, float>(config));
     } else if (config.device == PaddleMobileConfig::kGPU_MALI) {
-      x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<GPU_MALI, float>(config));
    } else if (config.device == PaddleMobileConfig::kGPU_CL) {
-      x.reset(new PaddleMobilePredictor<GPU_CL, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<GPU_CL, float>(config));
    } else {
      LOG(kLOG_ERROR) << "unsupport device type!";
      return nullptr;
...
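The paddle_api wrapper shown above picks a PaddleMobilePredictor specialization from the config's precision and device fields. A hedged sketch of driving it; the header path and the model_dir field are assumptions, since only precision, device, thread_num and cl_path appear in this diff:

#include <vector>
#include "io/paddle_inference_api.h"  // assumed public header for this API

bool RunWithApi() {
  paddle_mobile::PaddleMobileConfig config;
  config.precision = paddle_mobile::PaddleMobileConfig::FP32;
  config.device = paddle_mobile::PaddleMobileConfig::kCPU;
  config.thread_num = 2;
  config.model_dir = "/path/to/model_dir";  // assumed field, not shown above

  auto predictor = paddle_mobile::CreatePaddlePredictor<
      paddle_mobile::PaddleMobileConfig,
      paddle_mobile::PaddleEngineKind::kPaddleMobile>(config);
  if (!predictor) return false;

  std::vector<paddle_mobile::PaddleTensor> inputs(1), outputs(1);
  // ... fill inputs[0].shape and inputs[0].data with the real input here ...
  return predictor->Run(inputs, &outputs, /*batch_size=*/1);
}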
@@ -29,7 +29,7 @@ limitations under the License. */
 
 namespace paddle_mobile {
 
-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device = CPU, typename T = float>
 class PaddleMobilePredictor : public PaddlePredictor {
  public:
   PaddleMobilePredictor() = delete;
@@ -43,7 +43,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
   ~PaddleMobilePredictor() override;
 
  private:
-  std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_;
+  std::unique_ptr<PaddleMobile<Device, T>> paddle_mobile_;
   bool Init(const PaddleMobileConfig& config);
 
   PaddleMobileConfig config_;
...
@@ -48,7 +48,7 @@
 @interface PaddleMobileCPU()
 {
-  paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> *pam_;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> *pam_;
   BOOL loaded_;
 }
 @end
@@ -59,7 +59,7 @@ static std::mutex shared_mutex;
 - (instancetype)init {
   if (self = [super init]) {
-    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32>();
+    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
   }
   return self;
 }
@@ -220,7 +220,8 @@ static std::mutex shared_mutex;
     memcpy(input_ptr, input,
            numel * sizeof(float));
-    std::shared_ptr<paddle_mobile::framework::Tensor> output = pam_->Predict(input_tensor);
+    pam_->Predict(input_tensor);
+    std::shared_ptr<paddle_mobile::framework::Tensor> output = pam_->Fetch();
 
     float *output_pointer = new float[output->numel()];
...
@@ -16,21 +16,23 @@ limitations under the License. */
 #include "paddle_mobile_jni.h"
 
 #include <cmath>
+#include <string>
+#include <vector>
 
 #include "common/log.h"
 #include "framework/tensor.h"
 #include "io/paddle_mobile.h"
 
 #ifdef ENABLE_EXCEPTION
 #include "common/enforce.h"
 #endif
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 namespace paddle_mobile {
 namespace jni {
 
 using framework::DDim;
 using framework::Program;
 using framework::Tensor;
@@ -200,7 +202,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = dataPointer[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
@@ -233,7 +236,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = dataPointer[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
@@ -328,7 +332,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = matrix[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
@@ -363,7 +368,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = matrix[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
@@ -399,7 +405,8 @@ Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) {
   auto *pdata = words.mutable_data<int64_t>();
   size_t n = words.numel() * sizeof(int64_t);
   memcpy(pdata, ids.data(), n);
-  auto vec_result = paddle_mobile.PredictLod(words);
+  paddle_mobile.Predict(words);
+  auto vec_result = paddle_mobile.Fetch();
   int count = vec_result->numel();
   jlongArray result = NULL;
   ANDROIDLOGE("predict nlp size %d", count);
...
@@ -13,81 +13,81 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "io/paddle_mobile.h"
+#include <utility>
+#include "common/common.h"
 #ifdef PADDLE_MOBILE_CL
 #include <CL/cl.h>
 #include "framework/cl/cl_tensor.h"
 #endif
-#include "common/common.h"
 #include "operators/math/gemm.h"
 
 namespace paddle_mobile {
 
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::SetThreadNum(int num) {
 #ifdef _OPENMP
   omp_set_num_threads(num);
 #endif
 }
 
-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
-                                  bool quantification, int batch_size,
-                                  bool loddable) {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
+                                       bool optimize, bool quantification,
+                                       int batch_size, bool loddable) {
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
 
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->Load(dirname, optimize, quantification), batch_size, optimize,
         loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
 
-  return true;
+  return PMSuccess;
 }
 
-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
-                                  const std::string &para_path, bool optimize,
-                                  bool quantification, int batch_size,
-                                  bool loddable) {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
+                                       const std::string &para_path,
+                                       bool optimize, bool quantification,
+                                       int batch_size, bool loddable) {
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
 
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->Load(model_path, para_path, optimize, quantification),
         batch_size, optimize, loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
 
-  return true;
+  return PMSuccess;
 }
 
-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
-                                                const uint8_t *model_buf,
-                                                size_t combined_params_len,
-                                                uint8_t *combined_params_buf) {
+template <typename Device, typename T>
+bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
+                                                 const uint8_t *model_buf,
+                                                 size_t combined_params_len,
+                                                 uint8_t *combined_params_buf) {
   int batch_size = 1;
   bool optimise = true;
   bool quantification = false;
 
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
 
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
                                     combined_params_buf, optimise,
                                     quantification),
@@ -96,38 +96,76 @@ bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
     LOG(kLOG_INFO) << "executor inited";
   }
 
-  return true;
+  return PMSuccess;
 }
 
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::Predict(
-    const framework::Tensor &t) {
-  return executor_->Predict(t);
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(const framework::Tensor &input) {
+  std::vector<std::pair<std::string, framework::Tensor>> inputs;
+  inputs.push_back(std::make_pair("feed", input));
+  return this->Predict(inputs);
+}
+
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(const framework::LoDTensor &input) {
+  std::vector<std::pair<std::string, framework::LoDTensor>> inputs;
+  inputs.push_back(std::make_pair("feed", input));
+  return this->Predict(inputs);
+}
+
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(
+    const std::vector<std::pair<std::string, framework::Tensor>> &inputs) {
+  return executor_->Predict(inputs);
 }
 
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::PredictLod(
-    const framework::LoDTensor &t) {
-  return executor_->PredictLod(t);
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(
+    const std::vector<std::pair<std::string, framework::LoDTensor>> &inputs) {
+  return executor_->Predict(inputs);
 }
 
-template <typename Dtype, Precision P>
-std::vector<typename PaddleMobile<Dtype, P>::Ptype>
-PaddleMobile<Dtype, P>::Predict(const std::vector<Ptype> &input,
-                                const std::vector<int64_t> &dims) {
+template <typename Device, typename T>
+std::vector<T> PaddleMobile<Device, T>::Predict(
+    const std::vector<T> &input, const std::vector<int64_t> &dims) {
   return executor_->Predict(input, dims);
 }
 
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Clear() {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict() {
+  return executor_->Predict();
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Feed(const framework::Tensor &input,
+                                   const std::string &var_name) {
+  executor_->SetInput(input, var_name);
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Feed(const framework::LoDTensor &input,
+                                   const std::string &var_name) {
+  executor_->SetInput(input, var_name);
+}
+
+typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
+template <typename Device, typename T>
+LoDTensorPtr PaddleMobile<Device, T>::Fetch(const std::string &var_name) {
+  return executor_->GetOutput(var_name);
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Clear() {
   executor_ = nullptr;
   loader_ = nullptr;
 }
 
-template <typename Dtype, Precision P>
-double PaddleMobile<Dtype, P>::GetPredictTime() {}
+template <typename Device, typename T>
+double PaddleMobile<Device, T>::GetPredictTime() {}
+
 #ifdef PADDLE_MOBILE_CPU
 template <>
-double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
+double PaddleMobile<CPU, float>::GetPredictTime() {
   int m = 32;
   int n = 224 * 224;
   int k = 27;
@@ -148,7 +186,8 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
   for (int i = 0; i < k * n; ++i) {
     b[i] = t1 + rand() % t2;  // NOLINT
   }
-  paddle_mobile::operators::math::Gemm gemm;
+
+  operators::math::Gemm gemm;
   auto time1 = paddle_mobile::time();
   gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
              static_cast<float>(0), c, ldc, false,
@@ -162,57 +201,51 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
 }
 #endif
 
-template <typename Dtype, Precision P>
-PaddleMobile<Dtype, P>::~PaddleMobile() {
-  executor_ = nullptr;
-  loader_ = nullptr;
-}
-
 #ifdef PADDLE_MOBILE_FPGA
-
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t,
-                                            std::string var_name) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::InjectVariable(const framework::Tensor &t,
+                                             std::string var_name) {
   executor_->InjectVariable(t, var_name);
 }
 
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
   executor_->FeedData(t);
 }
 
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) {
+template <typename Device, typename T>
+std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
+    int id) {
   return executor_->FetchResult(id);
 }
 
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Predict_From_To(int start, int end) {
   executor_->Predict_From_To(start, end);
 }
 
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_From(int start) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Predict_From(int start) {
  executor_->Predict_From(start);
 }
 
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_To(int end) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Predict_To(int end) {
   executor_->Predict_To(end);
 }
 #endif
 
 #ifdef PADDLE_MOBILE_CL
 static std::mutex lc;
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::SetCLPath(std::string path) {
   std::lock_guard<std::mutex> lock(lc);
   if (framework::CLEngine::Instance()->GetCLPath() == "") {
     framework::CLEngine::Instance()->setClPath(path);
  }
 }
 template <>
-double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
+double PaddleMobile<GPU_CL, float>::GetPredictTime() {
   cl_int status;
   cl_uint nPlatform;
   clGetPlatformIDs(0, NULL, &nPlatform);
@@ -410,8 +443,8 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
     return -1;
   }
 }
-template <typename Dtype, Precision P>
-int PaddleMobile<Dtype, P>::readText(
+template <typename Device, typename T>
+int PaddleMobile<Device, T>::readText(
     const char *kernelPath,
     char **pcode) {  // 读取文本文件放入 pcode,返回字符串长度
   FILE *fp;
@@ -440,13 +473,11 @@ int PaddleMobile<Dtype, P>::readText(
   fclose(fp);
   return size + 1;
 }
-
 #endif
 
-template class PaddleMobile<CPU, Precision::FP32>;
-template class PaddleMobile<FPGA, Precision::FP32>;
-template class PaddleMobile<GPU_MALI, Precision::FP32>;
-
-template class PaddleMobile<GPU_CL, Precision::FP32>;
+template class PaddleMobile<CPU, float>;
+template class PaddleMobile<FPGA, float>;
+template class PaddleMobile<GPU_MALI, float>;
+template class PaddleMobile<GPU_CL, float>;
 
 }  // namespace paddle_mobile
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #ifdef _OPENMP
 #include <omp.h>
@@ -32,43 +33,52 @@ limitations under the License. */
 namespace paddle_mobile {
 
-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class PaddleMobile {
-  typedef typename PrecisionTrait<P>::ptype Ptype;
-
  public:
   PaddleMobile() {
 #ifndef PADDLE_MOBILE_CL
-    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Dtype>::value;
-    PADDLE_MOBILE_ENFORCE(!is_gpu,
-                          "Not Enable GPU in CmakeList but run gpu codes ");
+    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
+    PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
 #endif
   }
+  ~PaddleMobile() {}
 
-  bool Load(const std::string &dirname, bool optimize = false,
-            bool quantification = false, int batch_size = 1,
-            bool loddable = false);
+  PMStatus Load(const std::string &dirname, const bool optimize = false,
+                const bool quantification = false, const int batch_size = 1,
+                const bool lod = false);
+  PMStatus Load(const std::string &model_path, const std::string &para_path,
+                const bool optimize = false, const bool quantification = false,
+                const int batch_size = 1, const bool lod = false);
 
-  bool Load(const std::string &model_path, const std::string &para_path,
-            bool optimize = false, bool quantification = false,
-            int batch_size = 1, bool loddable = false);
+  PMStatus Predict(const framework::Tensor &input);
+  PMStatus Predict(const framework::LoDTensor &input);
 
-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, framework::Tensor>> &inputs);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, framework::LoDTensor>> &inputs);
 
-  std::shared_ptr<framework::Tensor> PredictLod(const framework::LoDTensor &t);
+  std::vector<T> Predict(const std::vector<T> &input,
+                         const std::vector<int64_t> &dims);
+  PMStatus Predict();
 
-  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
-                             const std::vector<int64_t> &dims);
+  void Feed(const framework::LoDTensor &input, const std::string &var_name);
+  void Feed(const framework::Tensor &input, const std::string &var_name);
+
+  typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
+  LoDTensorPtr Fetch(const std::string &var_name);
+  LoDTensorPtr Fetch() { return Fetch("fetch"); }
 
   bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
                           size_t combined_params_len,
                           uint8_t *combined_params_buf);
 
-  void SetThreadNum(int num);
+  void SetThreadNum(int count);
   void Clear();
   double GetPredictTime();
-  ~PaddleMobile();
 
 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
@@ -79,15 +89,15 @@ class PaddleMobile {
 #endif
 
 #ifdef PADDLE_MOBILE_CL
- public:
+ public:  // NOLINT
   void SetCLPath(std::string cl_path);
   int readText(const char *kernelPath,
               char **pcode);  // 读取文本文件放入 pcode,返回字符串长度
 #endif
 
 private:
-  std::shared_ptr<framework::Loader<Dtype, P>> loader_;
-  std::shared_ptr<framework::Executor<Dtype, P>> executor_;
+  std::shared_ptr<framework::Loader<Device, T>> loader_;
+  std::shared_ptr<framework::Executor<Device, T>> executor_;
 };
 
 }  // namespace paddle_mobile
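For variable-length inputs the header above also exposes Feed, the no-argument Predict and a named Fetch, mirroring the predictLod path in the JNI wrapper. A sketch under the assumption that the model was exported with a single int64 "feed" input of shape {n, 1}; paths and shapes are placeholders:

#include <cstring>
#include <vector>
#include "io/paddle_mobile.h"

int RunLodModel(const std::vector<int64_t> &ids) {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> pm;
  pm.Load("/path/to/model", "/path/to/params", /*optimize=*/true,
          /*quantification=*/false, /*batch_size=*/1, /*lod=*/true);

  paddle_mobile::framework::LoDTensor words;
  words.Resize(paddle_mobile::framework::make_ddim(
      {static_cast<int64_t>(ids.size()), 1}));
  auto *pdata = words.mutable_data<int64_t>();
  std::memcpy(pdata, ids.data(), ids.size() * sizeof(int64_t));

  pm.Feed(words, "feed");           // stage the input under its variable name
  pm.Predict();                     // run the loaded program
  auto result = pm.Fetch("fetch");  // shared_ptr<framework::LoDTensor>
  return static_cast<int>(result->numel());
}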
@@ -14,10 +14,12 @@ limitations under the License. */
 #include "io/paddle_test_inference_api.h"
 #include "io/paddle_mobile.h"
 
 namespace paddle_mobile {
-template <typename Dtype, Precision P>
-double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
-  PaddleMobile<Dtype, P> paddle_mobile;
+
+template <typename Device, typename T>
+double PaddleTester<Device, T>::CaculatePredictTime(std::string *cl_path) {
+  PaddleMobile<Device, T> paddle_mobile;
 #ifdef PADDLE_MOBILE_CL
   if (cl_path) {
     paddle_mobile.SetCLPath(*cl_path);
@@ -26,10 +28,10 @@ double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
 #endif
   return paddle_mobile.GetPredictTime();
 }
-template class PaddleTester<CPU, Precision::FP32>;
-template class PaddleTester<FPGA, Precision::FP32>;
-template class PaddleTester<GPU_MALI, Precision::FP32>;
-template class PaddleTester<GPU_CL, Precision::FP32>;
+template class PaddleTester<CPU, float>;
+template class PaddleTester<FPGA, float>;
+template class PaddleTester<GPU_MALI, float>;
+template class PaddleTester<GPU_CL, float>;
 
 }  // namespace paddle_mobile
...@@ -20,10 +20,13 @@ limitations under the License. */ ...@@ -20,10 +20,13 @@ limitations under the License. */
*/ */
#pragma once #pragma once
#include "common/types.h" #include "common/types.h"
#include "string" #include "string"
namespace paddle_mobile { namespace paddle_mobile {
template <typename Dtype, Precision P = Precision::FP32>
template <typename Device, typename T = float>
class PaddleTester { class PaddleTester {
public: public:
double CaculatePredictTime(std::string *cl_path = nullptr); double CaculatePredictTime(std::string *cl_path = nullptr);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CAST_OP
#include "operators/cast_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void CastOp<DeviceType, T>::InferShape() const {
const auto &dims = this->param_.input_->dims();
this->param_.output_->Resize(dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(cast, ops::CastOp);
#endif
#endif // CAST_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CAST_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/kernels.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class CastOp : public framework::OperatorWithKernel<
DeviceType, CastParam<DeviceType>,
operators::CastKernel<DeviceType, T>> {
public:
CastOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, CastParam<DeviceType>,
operators::CastKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // CAST_OP
...@@ -33,4 +33,4 @@ namespace ops = paddle_mobile::operators; ...@@ -33,4 +33,4 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp); REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp);
#endif #endif
#endif #endif // DEQUANT_OP
...@@ -44,4 +44,4 @@ class DequantizeOp ...@@ -44,4 +44,4 @@ class DequantizeOp
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // DEQUANT_OP
...@@ -25,12 +25,11 @@ limitations under the License. */ ...@@ -25,12 +25,11 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using std::string;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FillConstantOp : public framework::OperatorBase<DeviceType> { class FillConstantOp : public framework::OperatorBase<DeviceType> {
public: public:
FillConstantOp(const string &type, const VariableNameMap &inputs, FillConstantOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const VariableNameMap &outputs,
const framework::AttributeMap attrs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope) std::shared_ptr<framework::Scope> scope)
...@@ -58,7 +57,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> { ...@@ -58,7 +57,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
tensor->Resize(framework::make_ddim(param_.Shape())); tensor->Resize(framework::make_ddim(param_.Shape()));
tensor->mutable_data(framework::ToTypeIndex(data_type)); tensor->mutable_data(framework::ToTypeIndex(data_type));
math::set_constant(tensor, value); math::SetConstant(tensor, value);
} }
void Init() {} void Init() {}
......
...@@ -14,19 +14,15 @@ limitations under the License. */ ...@@ -14,19 +14,15 @@ limitations under the License. */
#ifdef GRU_OP #ifdef GRU_OP
#include "operators/gru_op.h"
#include <vector> #include <vector>
#include "common/enforce.h" #include "common/enforce.h"
#include "operators/gru_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void GruOp<Dtype, T>::InferShape() const { void GruOp<Dtype, T>::InferShape() const {
auto lod_size = this->param_.InputInput()->lod().size();
PADDLE_MOBILE_ENFORCE((lod_size == 1),
"Current LoD only supports one dimension.");
auto input_dims = this->param_.InputInput()->dims(); auto input_dims = this->param_.InputInput()->dims();
auto weight_dims = this->param_.InputWeight()->dims(); auto weight_dims = this->param_.InputWeight()->dims();
int input_size = input_dims[1]; int input_size = input_dims[1];
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef IM2SEQUENCE_OP #ifdef IM2SEQUENCE_OP
#include "operators/im2sequence_op.h" #include "operators/im2sequence_op.h"
#include <vector>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -29,20 +30,16 @@ int Im2SequenceOutputSize(int input_size, int kernel, int padding_1, ...@@ -29,20 +30,16 @@ int Im2SequenceOutputSize(int input_size, int kernel, int padding_1,
template <typename Dtype, typename T> template <typename Dtype, typename T>
void Im2SequenceOp<Dtype, T>::InferShape() const { void Im2SequenceOp<Dtype, T>::InferShape() const {
auto in_x_dims = this->param_.Input()->dims(); auto in_x_dims = this->param_.Input()->dims();
const std::vector<int> &kernels = this->param_.Kernels(); const std::vector<int> &kernels = this->param_.Kernels();
const std::vector<int> &strides = this->param_.Strides(); const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings(); std::vector<int> paddings = this->param_.Paddings();
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]}); std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(Im2SequenceOutputSize(in_x_dims[i + 2], kernels[i], output_shape.push_back(Im2SequenceOutputSize(in_x_dims[i + 2], kernels[i],
paddings[i], paddings[i + 2], paddings[i], paddings[i + 2],
strides[i])); strides[i]));
} }
framework::DDim ddim = framework::make_ddim(output_shape); framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim); this->param_.Output()->Resize(ddim);
} }
...@@ -54,9 +51,5 @@ namespace ops = paddle_mobile::operators; ...@@ -54,9 +51,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp); REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif // IM2SEQUENCE_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CAST_OP
#include <algorithm>
#include <vector>
#include "framework/data_type.h"
#include "operators/kernel/kernels.h"
namespace paddle_mobile {
namespace operators {
template <typename InT>
struct CastOutOpFunctor {
const framework::Tensor* in_;
framework::Tensor* out_;
CastOutOpFunctor(const framework::Tensor* in, framework::Tensor* out)
: in_(in), out_(out) {}
template <typename OutT>
void apply() const {
const InT* input = in_->data<InT>();
OutT* output = out_->mutable_data<OutT>();
size_t numel = in_->numel();
for (int i = 0; i < numel; ++i) {
output[i] = static_cast<OutT>(input[i]);
}
}
};
struct CastOpFunctor {
const framework::Tensor* in_;
framework::Tensor* out_;
int output_type_;
CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
const int output_type)
: in_(in), out_(out), output_type_(output_type) {}
template <typename InT>
void apply() const {
framework::VisitDataType(framework::ToDataType(output_type_),
CastOutOpFunctor<InT>(in_, out_));
}
};
template <>
bool CastKernel<CPU, float>::Init(CastParam<CPU>* param) {
return true;
}
template <>
void CastKernel<CPU, float>::Compute(const CastParam<CPU>& param) {
const Tensor* input = param.input_;
Tensor* output = param.output_;
framework::VisitDataType(framework::ToDataType(param.input_type_),
CastOpFunctor(input, output, param.output_type_));
}
} // namespace operators
} // namespace paddle_mobile
#endif // CAST_OP
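The cast kernel above resolves both element types at runtime through two nested VisitDataType calls: the outer functor fixes the input type InT, the inner one fixes OutT and performs an element-wise static_cast. A minimal standalone sketch of that inner loop, using plain pointers instead of framework tensors (illustrative only):

#include <cstddef>

// Element-wise cast from InT to OutT over a contiguous buffer; this mirrors
// CastOutOpFunctor::apply() without the framework::Tensor plumbing.
template <typename InT, typename OutT>
void CastBuffer(const InT *in, OutT *out, size_t numel) {
  for (size_t i = 0; i < numel; ++i) {
    out[i] = static_cast<OutT>(in[i]);
  }
}

// Example: CastBuffer<float, int64_t>(src, dst, n) converts n floats to int64.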
...@@ -55,10 +55,9 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) { ...@@ -55,10 +55,9 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
param->Input()->dims()[2] <= 140 /* referred from ncnn */) { param->Input()->dims()[2] <= 140 /* referred from ncnn */) {
param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT; param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
// transform weight // transform weight
framework::Tensor transformed_weight; param->transformed_filter_ = new framework::Tensor;
operators::math::winograd_transform_weight<8, 3>(*param->Filter(), operators::math::winograd_transform_weight<8, 3>(
&transformed_weight); *param->Filter(), param->transformed_filter_);
framework::TensorCopy(transformed_weight, param->Filter());
#endif #endif
} else { } else {
param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_FLOAT; param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_FLOAT;
......
...@@ -30,8 +30,8 @@ bool DequantizeKernel<CPU, float>::Init(DequantizeParam<CPU> *param) { ...@@ -30,8 +30,8 @@ bool DequantizeKernel<CPU, float>::Init(DequantizeParam<CPU> *param) {
template <> template <>
void DequantizeKernel<CPU, float>::Compute(const DequantizeParam<CPU> &param) { void DequantizeKernel<CPU, float>::Compute(const DequantizeParam<CPU> &param) {
const Tensor *input = param.input_; const LoDTensor *input = param.input_;
Tensor *output = param.output_; LoDTensor *output = param.output_;
float activation_scale = param.activation_scale_->data<float>()[0]; float activation_scale = param.activation_scale_->data<float>()[0];
float weight_scale = param.weight_scale_; float weight_scale = param.weight_scale_;
const int32_t *x = input->data<const int32_t>(); const int32_t *x = input->data<const int32_t>();
...@@ -72,6 +72,7 @@ void DequantizeKernel<CPU, float>::Compute(const DequantizeParam<CPU> &param) { ...@@ -72,6 +72,7 @@ void DequantizeKernel<CPU, float>::Compute(const DequantizeParam<CPU> &param) {
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
y[i] = x[i] * scale; y[i] = x[i] * scale;
} }
output->set_lod(input->lod());
} }
} // namespace operators } // namespace operators
......
...@@ -29,12 +29,6 @@ template <> ...@@ -29,12 +29,6 @@ template <>
void GruKernel<CPU, float>::Compute(const GruParam<CPU> &param) { void GruKernel<CPU, float>::Compute(const GruParam<CPU> &param) {
GruCompute<float>(param); GruCompute<float>(param);
param.OutHidden()->set_lod(param.InputInput()->lod()); param.OutHidden()->set_lod(param.InputInput()->lod());
// DLOG << "________________" << param.OutHidden()->dims();
// DLOG << "________________" << param.OutHidden()->numel();
// auto *hiden_data = param.OutHidden()->data<float>();
// for (int64_t i = 0; i < 10; i++) {
// DLOG << "****************" << hiden_data[i];
// }
} }
template class GruKernel<CPU, float>; template class GruKernel<CPU, float>;
......
...@@ -186,8 +186,8 @@ bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) { ...@@ -186,8 +186,8 @@ bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
template <> template <>
void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) { void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
const Tensor *input = param.input_; const LoDTensor *input = param.input_;
Tensor *output = param.output_; LoDTensor *output = param.output_;
Tensor *output_scale = param.online_scale_; Tensor *output_scale = param.online_scale_;
float max_abs = 0.f; float max_abs = 0.f;
if (param.offline_) { if (param.offline_) {
...@@ -212,6 +212,7 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) { ...@@ -212,6 +212,7 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
LOG(kLOG_ERROR) << "round type is not supported."; LOG(kLOG_ERROR) << "round type is not supported.";
break; break;
} }
output->set_lod(input->lod());
} }
} // namespace operators } // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TOP_K_OP
#include <algorithm>
#include <iostream>
#include <vector>
#include "operators/kernel/kernels.h"
namespace paddle_mobile {
namespace operators {
template <>
bool TopKKernel<CPU, float>::Init(TopKParam<CPU> *param) {
return true;
}
template <>
void TopKKernel<CPU, float>::Compute(const TopKParam<CPU> &param) {
const Tensor *input = param.input_;
Tensor *output = param.output_;
Tensor *indices = param.indices_;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int64_t *indices_data = indices->mutable_data<int64_t>();
framework::DDim input_dims = input->dims();
const size_t row = framework::product(
framework::slice_ddim(input_dims, 0, input_dims.size() - 1));
const size_t col = input_dims[input_dims.size() - 1];
#pragma omp parallel for
for (size_t i = 0; i < row; i++) {
std::vector<std::pair<float, size_t>> vec(col);
const float *input_ptr = input_data + i * col;
float *output_ptr = output_data + i * param.k_;
int64_t *indices_ptr = indices_data + i * param.k_;
for (size_t j = 0; j < col; j++) {
vec[j] = std::move(std::pair<float, size_t>(input_ptr[j], j));
}
std::partial_sort(
vec.begin(), vec.begin() + param.k_, vec.end(),
[](const std::pair<float, size_t> &l,
const std::pair<float, size_t> &r) { return l.first > r.first; });
for (int j = 0; j < param.k_; ++j) {
output_ptr[j] = vec[j].first;
indices_ptr[j] = static_cast<int64_t>(vec[j].second);
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif // TOP_K_OP
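For reference, the per-row selection above boils down to std::partial_sort over (value, index) pairs; a minimal sketch on a single row of hypothetical data with k = 2 behaves as follows:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<float> row = {0.1f, 0.9f, 0.3f, 0.7f};
  const int k = 2;
  std::vector<std::pair<float, size_t>> vec(row.size());
  for (size_t j = 0; j < row.size(); ++j) {
    vec[j] = std::make_pair(row[j], j);
  }
  // Only the first k entries need to end up sorted, so partial_sort suffices.
  std::partial_sort(vec.begin(), vec.begin() + k, vec.end(),
                    [](const std::pair<float, size_t> &l,
                       const std::pair<float, size_t> &r) {
                      return l.first > r.first;
                    });
  for (int j = 0; j < k; ++j) {
    std::printf("value=%.1f index=%zu\n", vec[j].first, vec[j].second);
  }
  // Prints: value=0.9 index=1, then value=0.7 index=3.
  return 0;
}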
...@@ -18,283 +18,63 @@ limitations under the License. */ ...@@ -18,283 +18,63 @@ limitations under the License. */
#include <cmath> #include <cmath>
#include "operators/op_param.h" #include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif // __ARM_NEON__
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename P> template <typename P>
void BatchnormCompute(const BatchNormParam<CPU> &param) { void BatchnormCompute(const BatchNormParam<CPU> &param) {
const Tensor *input_x = param.InputX();
auto input_x_ptr = input_x->data<float>();
const auto &x_dims = input_x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int H = x_dims[2];
const int W = x_dims[3];
const int stride0 = C * H * W;
const int stride1 = H * W;
const int stride2 = W;
Tensor *out = param.OutputY();
auto out_ptr = out->mutable_data<float>();
const float epsilon = param.Epsilon(); const float epsilon = param.Epsilon();
const Tensor *mean = param.InputMean(); const float *mean_ptr = param.InputMean()->data<float>();
const Tensor *variance = param.InputVariance(); const float *variance_ptr = param.InputVariance()->data<float>();
const Tensor *scale = param.InputScale(); const float *scale_ptr = param.InputScale()->data<float>();
const Tensor *bias = param.InputBias(); const float *bias_ptr = param.InputBias()->data<float>();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>(); const framework::Tensor *input = param.InputX();
auto scale_ptr = scale->data<float>(); const float *input_ptr = input->data<float>();
auto bias_ptr = bias->data<float>(); framework::Tensor *output = param.OutputY();
float *output_ptr = output->mutable_data<float>();
// Tensor inv_std; size_t spatial_size = output->dims()[2] * output->dims()[3];
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C})); int channels = output->dims()[1];
PADDLE_MOBILE_ENFORCE(C == variance->numel(), #pragma omp parallel for collapse(2)
"C must equal to variance.numel()"); for (int batch = 0; batch < output->dims()[0]; ++batch) {
for (int c = 0; c < channels; ++c) {
int HXW = H * W; float inv_scale = 1.f / (std::sqrt(variance_ptr[c] + epsilon));
float bias = bias_ptr[c] - inv_scale * scale_ptr[c] * mean_ptr[c];
#if __ARM_NEON float scale = inv_scale * scale_ptr[c];
#if __aarch64__ size_t offset = (batch * channels + c) * spatial_size;
float *inv_std_ptr = new float[C]; const float *x = input_ptr + offset;
for (int i = 0; i < C; i++) { float *y = output_ptr + offset;
inv_std_ptr[i] = size_t remain = spatial_size;
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5)); #if defined(__ARM_NEON__) || defined(__ARM_NEON)
} int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
Tensor new_scale; float32x4_t __scale = vdupq_n_f32(scale);
auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C})); float32x4_t __bias = vdupq_n_f32(bias);
Tensor new_bias; for (int k = 0; k < loop; ++k, x += 16, y += 16) {
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C})); float32x4_t r0 = vld1q_f32(x);
float32x4_t r1 = vld1q_f32(x + 4);
/// ((x - est_mean) * (inv_var) * scale + bias equal to float32x4_t r2 = vld1q_f32(x + 8);
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) float32x4_t r3 = vld1q_f32(x + 12);
for (int i = 0; i < C; i++) { r0 = vmlaq_f32(__bias, __scale, r0);
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; r1 = vmlaq_f32(__bias, __scale, r1);
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; r2 = vmlaq_f32(__bias, __scale, r2);
{ r3 = vmlaq_f32(__bias, __scale, r3);
for (int n = 0; n < N; n++) { vst1q_f32(y, r0);
for (int h = 0; h < H; h++) { vst1q_f32(y + 4, r1);
int tmp_index = n * stride0 + i * stride1 + h * stride2; vst1q_f32(y + 8, r2);
for (int w = 0; w < W; w++) { vst1q_f32(y + 12, r3);
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
} }
} #endif // __ARM_NEON__
} for (int k = 0; k < remain; ++k) {
delete[] inv_std_ptr; y[k] = scale * x[k] + bias;
#else
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
float *volatile new_scale_ptr = new float[NXC * 4];
float *volatile new_bias_ptr = new float[NXC * 4];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C * 4; i += 4) {
int index = i / 4;
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[index] + epsilon), 0.5));
inv_std_ptr[i + 1] = inv_std_ptr[i];
inv_std_ptr[i + 2] = inv_std_ptr[i];
inv_std_ptr[i + 3] = inv_std_ptr[i];
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index];
new_scale_ptr[i + 1] = new_scale_ptr[i];
new_scale_ptr[i + 2] = new_scale_ptr[i];
new_scale_ptr[i + 3] = new_scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index];
new_bias_ptr[i + 1] = new_bias_ptr[i];
new_bias_ptr[i + 2] = new_bias_ptr[i];
new_bias_ptr[i + 3] = new_bias_ptr[i];
}
for (int j = C * 4; j < NXC * 4; ++j) {
new_scale_ptr[j] = new_scale_ptr[j - C * 4];
new_bias_ptr[j] = new_bias_ptr[j - C * 4];
}
asm volatile(
"subs %[N], %[N], #1 \n\t"
"blt end_n_%= \n\t"
"loop_n_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"blt end_c_%= \n\t"
"loop_c_%=: \n\t"
"vld1.32 {q9}, [%[new_scale_ptr]]! \n\t"
"vld1.32 {q10}, [%[new_bias_ptr]]! \n\t"
"mov r6, %[HXW] \n\t"
"subs r6, r6, #32 \n\t"
"blt end_hw_%= \n\t"
"loop_hw_%=: \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"subs r6, r6, #32 \n\t"
"bge loop_hw_%= \n\t"
"end_hw_%=: \n\t"
"cmp r6, #0 \n\t"
"bge end_remainder_%= \n\t"
"mov r5, #4 \n\t"
"mul r6, r6, r5 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"add %[out_ptr], %[out_ptr], r6 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"end_remainder_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"bge loop_c_%= \n\t"
"end_c_%=: \n\t"
"subs %[N], %[N], #1 \n\t"
"bge loop_n_%= \n\t"
"end_n_%=: \n\t"
:
: [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
[new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
[N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "r5", "r6");
delete[] inv_std_ptr;
delete[] new_scale_ptr;
delete[] new_bias_ptr;
} else {
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr =
new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
delete[] inv_std_ptr;
}
#endif
#else
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
} }
} }
} }
delete[] inv_std_ptr;
#endif
} }
} // namespace operators } // namespace operators
......
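The rewritten BatchnormCompute folds mean, variance, scale and bias into a single per-channel affine transform before touching the data: y = scale * x + bias, with scale = gamma / sqrt(var + eps) and bias = beta - scale * mean. A scalar sketch of one channel (illustrative only; the kernel above adds the NEON fast path and OpenMP):

#include <cmath>
#include <cstddef>

// Applies folded batch norm to one channel of spatial_size elements.
inline void BatchNormChannel(const float *x, float *y, size_t spatial_size,
                             float mean, float var, float gamma, float beta,
                             float epsilon) {
  const float inv_std = 1.f / std::sqrt(var + epsilon);
  const float scale = inv_std * gamma;
  const float bias = beta - scale * mean;
  for (size_t i = 0; i < spatial_size; ++i) {
    y[i] = scale * x[i] + bias;
  }
}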
...@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP #ifdef FUSION_CONVADDADDPRELU_OP
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
...@@ -115,20 +116,7 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) { ...@@ -115,20 +116,7 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step); Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step);
float *biase_data1 = bias1_slice.data<float>(); float *biase_data1 = bias1_slice.data<float>();
// int n = bias1_slice.dims()[0]; math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
// int m = bias1_slice.dims()[1];
// for(int i=0;i<n*m;i++){
// if(biase_data1[i]!=0)
// DLOG<<biase_data1[i]<<",yangfei";
// }
// math::matmul<float>(filter_slice, false, col_matrix,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
p, mode, biase_data, biase_data1); p, mode, biase_data, biase_data1);
} }
} }
...@@ -137,4 +125,4 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) { ...@@ -137,4 +125,4 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // FUSION_CONVADDADDPRELU_OP
...@@ -107,7 +107,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) { ...@@ -107,7 +107,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float, float>(filter_slice, false, col_matrix, false, math::MatMul<float, float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(1), false, biase_data); static_cast<float>(1), false, biase_data);
} }
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) { void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
...@@ -105,12 +106,13 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) { ...@@ -105,12 +106,13 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>( math::MatMulWithBn(filter_slice, false, col_matrix, false,
filter_slice, false, col_matrix, false, static_cast<float>(1), static_cast<float>(1), &out_slice,
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g); static_cast<float>(0), true, &new_scale, &new_bias, g);
} }
} }
} }
template <typename P> template <typename P>
void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) { void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) {
Tensor Bias; Tensor Bias;
...@@ -126,9 +128,6 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) { ...@@ -126,9 +128,6 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) {
param.Input()->dims()[1] == param.Output()->dims()[1] && param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(), param.Output(), param.NewScale(),
param.NewBias(), true); param.NewBias(), true);
......
...@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP #ifdef FUSION_CONVADDPRELU_OP
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
...@@ -30,8 +31,6 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) { ...@@ -30,8 +31,6 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
Tensor bias = *param.Bias(); Tensor bias = *param.Bias();
// DLOG<<"yangfei";
// DLOG<<bias.dims();
int axis = param.Axis(); int axis = param.Axis();
Tensor *output = param.Output(); Tensor *output = param.Output();
float *biase_data = bias.data<float>(); float *biase_data = bias.data<float>();
...@@ -112,13 +111,7 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) { ...@@ -112,13 +111,7 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
// math::matmul<float>(filter_slice, false, col_matrix, math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
p, mode, biase_data, nullptr); p, mode, biase_data, nullptr);
} }
} }
...@@ -127,4 +120,4 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) { ...@@ -127,4 +120,4 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // FUSION_CONVADDPRELU_OP
...@@ -112,7 +112,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { ...@@ -112,7 +112,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false, alpha, math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false, alpha,
&out_slice, beta, true, bias_data); &out_slice, beta, true, bias_data);
} }
} }
......
...@@ -106,7 +106,7 @@ inline void GemmConv(const ConvParam<CPU> &param) { ...@@ -106,7 +106,7 @@ inline void GemmConv(const ConvParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false, math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(0), false, static_cast<float>(0), false,
static_cast<Otype *>(nullptr)); static_cast<Otype *>(nullptr));
...@@ -117,7 +117,7 @@ inline void GemmConv(const ConvParam<CPU> &param) { ...@@ -117,7 +117,7 @@ inline void GemmConv(const ConvParam<CPU> &param) {
template <int tile, int kernel> template <int tile, int kernel>
inline void WinogradConv3x3(const ConvParam<CPU> &param) { inline void WinogradConv3x3(const ConvParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
const Tensor *filter = param.Filter(); const Tensor *filter = param.transformed_filter_;
Tensor *output = param.Output(); Tensor *output = param.Output();
output->mutable_data<float>(); output->mutable_data<float>();
int batch_size = input->dims()[0]; int batch_size = input->dims()[0];
......
...@@ -108,10 +108,10 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) { ...@@ -108,10 +108,10 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step); Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(filter_slice, false, col_matrix, false, math::MatMulWithBn(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(1), true, &new_scale, static_cast<float>(1), true, &new_scale, &new_bias, g,
&new_bias, g, bias_data.data<float>()); bias_data.data<float>());
} }
} }
} }
......
...@@ -107,9 +107,9 @@ void ConvBNReluBasic(const FusionConvBNReluParam<CPU> &param) { ...@@ -107,9 +107,9 @@ void ConvBNReluBasic(const FusionConvBNReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>( math::MatMulWithBn(filter_slice, false, col_matrix, false,
filter_slice, false, col_matrix, false, static_cast<float>(1), static_cast<float>(1), &out_slice,
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g); static_cast<float>(0), true, &new_scale, &new_bias, g);
} }
} }
} }
......
...@@ -93,7 +93,7 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) { ...@@ -93,7 +93,7 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) {
Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmul<P, P>(filter_slice, true, in_slice, false, math::MatMul<P, P>(filter_slice, true, in_slice, false,
static_cast<P>(1.0), &col_matrix, static_cast<P>(0.0)); static_cast<P>(1.0), &col_matrix, static_cast<P>(0.0));
if (data_dim == 2U) { if (data_dim == 2U) {
col2im(col, dilations, strides, col2im(col, dilations, strides,
......
...@@ -106,9 +106,9 @@ void DWConvBNReluBasic(const FusionDWConvBNReluParam<CPU> &param) { ...@@ -106,9 +106,9 @@ void DWConvBNReluBasic(const FusionDWConvBNReluParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>( math::MatMulWithBn(filter_slice, false, col_matrix, false,
filter_slice, false, col_matrix, false, static_cast<float>(1), static_cast<float>(1), &out_slice,
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g); static_cast<float>(0), true, &new_scale, &new_bias, g);
} }
} }
} }
......
...@@ -26,18 +26,12 @@ namespace paddle_mobile { ...@@ -26,18 +26,12 @@ namespace paddle_mobile {
namespace operators { namespace operators {
template <typename T> template <typename T>
struct AddFunctor { inline void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
inline T operator()(T a, T b) const { return a + b; } const framework::Tensor *input_x = param.InputX();
}; const framework::Tensor *input_y = param.InputY();
framework::Tensor *Out = param.Out();
template <typename P>
void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis(); int axis = param.Axis();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const auto &x_dims = input_x->dims(); const auto &x_dims = input_x->dims();
const auto &y_dims = input_y->dims(); const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions. /// axis = -1 represent the last dimensions.
...@@ -57,18 +51,20 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) { ...@@ -57,18 +51,20 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
const float *bias_data = input_y->data<float>(); const float *bias_data = input_y->data<float>();
const float *input_data = input_x->data<float>(); const float *input_data = input_x->data<float>();
float *output_data = Out->mutable_data<float>(); float *output_data = Out->mutable_data<float>();
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) { for (int i = 0; i < batch; ++i) {
#pragma omp parallel for
for (int j = 0; j < channels; ++j) { for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num; size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset; const float *input = input_data + offset;
const float *bias = bias_data + j; const float bias = bias_data[j];
float *output = output_data + offset; float *output = output_data + offset;
int remain = elementwise_num;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = elementwise_num >> 0x4; int loop = elementwise_num >> 0x4;
int remain = elementwise_num & 0xF; remain = elementwise_num & 0xF;
for (int k = 0; k < loop; ++k) { for (int k = 0; k < loop; ++k) {
float32x4_t rb = vdupq_n_f32(*bias); float32x4_t rb = vdupq_n_f32(bias);
float32x4_t r0 = vld1q_f32(input); float32x4_t r0 = vld1q_f32(input);
float32x4_t r1 = vld1q_f32(input + 4); float32x4_t r1 = vld1q_f32(input + 4);
float32x4_t r2 = vld1q_f32(input + 8); float32x4_t r2 = vld1q_f32(input + 8);
...@@ -84,15 +80,12 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) { ...@@ -84,15 +80,12 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
input += 16; input += 16;
output += 16; output += 16;
} }
#endif
for (int k = 0; k < remain; ++k) { for (int k = 0; k < remain; ++k) {
output[k] = input[k] + *bias; output[k] = input[k] + bias;
} }
} }
} }
#else
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out);
#endif
} }
template class ElementwiseAddKernel<CPU, float>; template class ElementwiseAddKernel<CPU, float>;
......
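The refactored ElementwiseAddCompute broadcasts a per-channel bias across every (batch, channel) slice; the NEON block handles sixteen floats per iteration and the tail loop covers the remainder. A plain scalar sketch of the same broadcast (no NEON, hypothetical helper name):

#include <cstddef>

// Adds bias[j] to each of the elementwise_num values of slice (i, j).
inline void ChannelwiseAdd(const float *x, const float *bias, float *out,
                           int batch, int channels, int elementwise_num) {
  for (int i = 0; i < batch; ++i) {
    for (int j = 0; j < channels; ++j) {
      const size_t offset =
          (static_cast<size_t>(i) * channels + j) * elementwise_num;
      for (int k = 0; k < elementwise_num; ++k) {
        out[offset + k] = x[offset + k] + bias[j];
      }
    }
  }
}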
...@@ -57,7 +57,7 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) { ...@@ -57,7 +57,7 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
for (int i = 0; i < out_dim[0]; i++) { for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(Otype) * classes); memory::Copy(out_data + i * classes, input_z_data, sizeof(Otype) * classes);
} }
math::matmul<Itype, Otype>(x_matrix, false, y_matrix, false, math::MatMul<Itype, Otype>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out, static_cast<float>(1), static_cast<float>(1), out, static_cast<float>(1),
false); false);
} }
......
...@@ -25,18 +25,16 @@ limitations under the License. */ ...@@ -25,18 +25,16 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using LoDTensor = framework::LoDTensor; template <typename Device, typename T>
using Tensor = framework::Tensor;
template <typename DeviceType, typename T>
inline void ReorderInitState(const framework::Tensor& src, inline void ReorderInitState(const framework::Tensor& src,
std::vector<size_t> index_lod, std::vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) { framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceType, T> row_shuffle; math::CopyMatrixRowsFunctor<Device, T> row_shuffle;
dst->mutable_data<T>(src.dims()); dst->mutable_data<T>(src.dims());
row_shuffle(src, index_lod, dst, indexed_src); row_shuffle(src, index_lod, dst, indexed_src);
} }
template <typename P>
template <typename T>
void GruCompute(const GruParam<CPU>& param) { void GruCompute(const GruParam<CPU>& param) {
auto* input = param.InputInput(); auto* input = param.InputInput();
auto* h0 = param.InputH0(); auto* h0 = param.InputH0();
...@@ -57,8 +55,6 @@ void GruCompute(const GruParam<CPU>& param) { ...@@ -57,8 +55,6 @@ void GruCompute(const GruParam<CPU>& param) {
bool is_reverse = param.IsReverse(); bool is_reverse = param.IsReverse();
math::LoDTensor2BatchFunctor<CPU, float> to_batch; math::LoDTensor2BatchFunctor<CPU, float> to_batch;
to_batch(*input, batch_gate, true, is_reverse); to_batch(*input, batch_gate, true, is_reverse);
// math::ClearTensor<CPU, float> clearTensor;
// clearTensor(batch_gate);
if (bias) { if (bias) {
math::RowwiseAdd<CPU, float> add_bias; math::RowwiseAdd<CPU, float> add_bias;
add_bias(*batch_gate, *bias, batch_gate); add_bias(*batch_gate, *bias, batch_gate);
...@@ -68,7 +64,7 @@ void GruCompute(const GruParam<CPU>& param) { ...@@ -68,7 +64,7 @@ void GruCompute(const GruParam<CPU>& param) {
gru_value.gate_weight = const_cast<float*>(weight_data); gru_value.gate_weight = const_cast<float*>(weight_data);
gru_value.state_weight = gru_value.state_weight =
const_cast<float*>(weight_data + 2 * frame_size * frame_size); const_cast<float*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0; framework::Tensor ordered_h0;
std::vector<size_t> order(batch_gate->lod()[2]); std::vector<size_t> order(batch_gate->lod()[2]);
if (h0) { if (h0) {
// Since the batch computing for GRU reorders the input sequences // Since the batch computing for GRU reorders the input sequences
...@@ -87,9 +83,10 @@ void GruCompute(const GruParam<CPU>& param) { ...@@ -87,9 +83,10 @@ void GruCompute(const GruParam<CPU>& param) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend); // BUG framework::Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); framework::Tensor reset_hidden_prev_t =
Tensor hidden_t = batch_hidden->Slice(bstart, bend); batch_reset_hidden_prev->Slice(bstart, bend);
framework::Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.output_value = hidden_t.data<float>(); gru_value.output_value = hidden_t.data<float>();
gru_value.gate_value = gate_t.data<float>(); gru_value.gate_value = gate_t.data<float>();
gru_value.reset_output_value = reset_hidden_prev_t.data<float>(); gru_value.reset_output_value = reset_hidden_prev_t.data<float>();
...@@ -105,7 +102,6 @@ void GruCompute(const GruParam<CPU>& param) { ...@@ -105,7 +102,6 @@ void GruCompute(const GruParam<CPU>& param) {
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // GRU_OP
...@@ -19,40 +19,6 @@ limitations under the License. */ ...@@ -19,40 +19,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
// 1. If both x and y are 2-D,
//    x = [[1,2],    y = [[5,6],
//         [3,4]]         [7,8]]
//    the result is an ordinary matrix product:
//    out = [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]]
//
// 2. If x or y has more than 2 dimensions, e.g. x has shape (2,3,4) and y has shape (4,1,2):
//    x = [[[1,2,3,4],
//          [2,3,4,5],
//          [3,4,5,6]],
//         [[1,2,3,4],
//          [2,3,4,5],
//          [3,4,5,6]]]
//    y = [[[1,2]],
//         [[3,4]],
//         [[5,6]],
//         [[7,8]]]
//    x_num_col_dims and y_num_col_dims are used to flatten x and y to 2-D.
//    The model provides x_num_col_dims = 2 and y_num_col_dims = 1 (half-open index ranges).
//    (1) For x = (2,3,4): multiply the dims in [0, x_num_col_dims), i.e. 2*3 = 6,
//        and the dims in [x_num_col_dims, xdim.size()), i.e. 4,
//        so the dims of Tensor x are rewritten as (6,4).
//    (2) For y = (4,1,2): multiply the dims in [0, y_num_col_dims), i.e. 4,
//        and the dims in [y_num_col_dims, ydim.size()), i.e. 1*2 = 2,
//        so the dims of Tensor y are rewritten as (4,2).
//    The memory layout of x and y is not affected.
//    x = [[1,2,3,4],                 y = [[1,2],
//         [2,3,4,5],                      [3,4],
//         [3,4,5,6],   matrix mult        [5,6],
//         [1,2,3,4],                      [7,8]]
//         [2,3,4,5],
//         [3,4,5,6]]
//    The result is x (6 rows x 4 cols) times y (4 rows x 2 cols), multiplied as in case 1, giving out (6 rows x 2 cols).
template <typename P> template <typename P>
void MulCompute(const MulParam<CPU> &param) { void MulCompute(const MulParam<CPU> &param) {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
...@@ -73,12 +39,12 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -73,12 +39,12 @@ void MulCompute(const MulParam<CPU> &param) {
} }
if (param.InputX()->type() == typeid(int8_t)) { if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>(); out->mutable_data<int32_t>();
math::matmul<int8_t, int32_t>(x_matrix, false, y_matrix, false, math::MatMul<int8_t, int32_t>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out, static_cast<float>(1), out,
static_cast<float>(0)); static_cast<float>(0));
} else { } else {
out->mutable_data<float>(); out->mutable_data<float>();
math::matmul<float, float>(x_matrix, false, y_matrix, false, math::MatMul<float, float>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out, static_cast<float>(1), out,
static_cast<float>(0)); static_cast<float>(0));
} }
......
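The (now removed) comment above describes how MulCompute flattens higher-rank inputs to matrices around x_num_col_dims / y_num_col_dims before calling MatMul. A small sketch of that flattening rule, using a hypothetical FlattenTo2D helper rather than the framework's own reshape utilities:

#include <cstdint>
#include <utility>
#include <vector>

// Collapses dims[0, num_col_dims) into rows and dims[num_col_dims, end) into
// cols, matching the reshape described in the comment above.
inline std::pair<int64_t, int64_t> FlattenTo2D(
    const std::vector<int64_t> &dims, int num_col_dims) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (size_t i = num_col_dims; i < dims.size(); ++i) cols *= dims[i];
  return std::make_pair(rows, cols);
}

// FlattenTo2D({2, 3, 4}, 2) yields (6, 4); FlattenTo2D({4, 1, 2}, 1) yields (4, 2).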
...@@ -294,11 +294,6 @@ void MultiClassNMSCompute(const MultiClassNMSParam<CPU>& param) { ...@@ -294,11 +294,6 @@ void MultiClassNMSCompute(const MultiClassNMSParam<CPU>& param) {
} }
} }
} }
// framework::LoD lod;
// lod.emplace_back(batch_starts);
//
// outs->set_lod(lod);
} }
} // namespace operators } // namespace operators
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "operators/kernel/feed_kernel.h" #include "operators/kernel/feed_kernel.h"
#include "framework/cl/cl_tensor.h" #include "framework/cl/cl_tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -43,8 +44,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { ...@@ -43,8 +44,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
const int Stride2 = out_C * out_H * out_W; const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W; const int Stride1 = out_H * out_W;
const int Stride0 = out_W; const int Stride0 = out_W;
CLTensor input_cl_tensor(this->cl_helper_.CLContext(), framework::CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue()); this->cl_helper_.CLCommandQueue());
input_cl_tensor.Resize(input->dims()); input_cl_tensor.Resize(input->dims());
cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input_data); cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input_data);
......
...@@ -94,27 +94,20 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context, ...@@ -94,27 +94,20 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context,
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
} }
// for (int i = 0; i < out->numel(); i++) { math::MatMul<float, float>(x_matrix, false, y_matrix, false,
// DLOG << out_data[i]; static_cast<float>(1), out, static_cast<float>(1),
// } false);
// bias_data的维度和out的维度一致
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1), false);
out_image->InitEmptyImage(context, commandQueue, out->dims()); out_image->InitEmptyImage(context, commandQueue, out->dims());
framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1); framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1);
DLOG << *out;
delete (input_x); delete (input_x);
delete (input_y); delete (input_y);
delete (input_z); delete (input_z);
delete (out); delete (out);
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
} }
template <> template <>
void FusionFcKernel<GPU_CL, float>::Compute( void FusionFcKernel<GPU_CL, float>::Compute(
const FusionFcParam<GPU_CL> &param) { const FusionFcParam<GPU_CL> &param) {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBN_OP #ifdef FUSION_CONVBN_OP
#include "operators/kernel/conv_bn_kernel.h" #include "operators/kernel/conv_bn_kernel.h"
#include <cmath>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP #ifdef FUSION_CONVBNRELU_OP
#include "operators/kernel/conv_bn_relu_kernel.h" #include "operators/kernel/conv_bn_relu_kernel.h"
#include <cmath>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#define DECLARE_KERNEL(KernelClass, KernelParam) \
template <typename DeviceType, typename T> \
class KernelClass \
: public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
public: \
bool Init(KernelParam<DeviceType> *param); \
void Compute(const KernelParam<DeviceType> &param); \
};
#ifdef TOP_K_OP
DECLARE_KERNEL(TopKKernel, TopKParam)
#endif // TOP_K_OP
#ifdef CAST_OP
DECLARE_KERNEL(CastKernel, CastParam)
#endif // CAST_OP
} // namespace operators
} // namespace paddle_mobile
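For clarity, DECLARE_KERNEL(CastKernel, CastParam) above expands to roughly the class below; this is only a sketch of the expansion, the actual declarations come from the macro:

template <typename DeviceType, typename T>
class CastKernel
    : public framework::OpKernelBase<DeviceType, CastParam<DeviceType>> {
 public:
  bool Init(CastParam<DeviceType> *param);
  void Compute(const CastParam<DeviceType> &param);
};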
...@@ -61,7 +61,7 @@ void FusionFcKernel<GPU_MALI, float>::Compute( ...@@ -61,7 +61,7 @@ void FusionFcKernel<GPU_MALI, float>::Compute(
for (int i = 0; i < out->numel(); i++) { for (int i = 0; i < out->numel(); i++) {
DLOG << out_data[i]; DLOG << out_data[i];
} }
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1)); out, static_cast<float>(1));
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) { // if (out_dim.size() != 2) {
......
...@@ -44,7 +44,7 @@ void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> &param) { ...@@ -44,7 +44,7 @@ void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> &param) {
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
} }
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0)); out, static_cast<float>(0));
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize(out_dim); out->Resize(out_dim);
......
...@@ -38,7 +38,11 @@ limitations under the License. */ ...@@ -38,7 +38,11 @@ limitations under the License. */
* *
* (this is the zlib license) * (this is the zlib license)
*/ */
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#pragma once #pragma once
#include <arm_neon.h> #include <arm_neon.h>
#define c_inv_mant_mask ~0x7f800000u #define c_inv_mant_mask ~0x7f800000u
...@@ -316,11 +320,11 @@ static inline float32x4_t cos_ps(float32x4_t x) { ...@@ -316,11 +320,11 @@ static inline float32x4_t cos_ps(float32x4_t x) {
static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) { static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) {
float32x4_t reciprocal = vrecpeq_f32(b); float32x4_t reciprocal = vrecpeq_f32(b);
reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
// reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
return vmulq_f32(a, reciprocal); return vmulq_f32(a, reciprocal);
} }
static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) { static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
// pow(x, m) = exp(m * log(x))
return exp_ps(vmulq_f32(b, log_ps(a))); return exp_ps(vmulq_f32(b, log_ps(a)));
} }
#endif // __ARM_NEON__
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include <cstring>
#include <string> #include <string>
#include "common/enforce.h"
#include "framework/data_type.h" #include "framework/data_type.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
...@@ -35,13 +35,13 @@ struct TensorSetConstant { ...@@ -35,13 +35,13 @@ struct TensorSetConstant {
float value_; float value_;
}; };
void set_constant(framework::Tensor *tensor, float value) { void SetConstant(framework::Tensor *tensor, float value) {
framework::VisitDataType(framework::ToDataType(tensor->type()), framework::VisitDataType(framework::ToDataType(tensor->type()),
TensorSetConstant(tensor, value)); TensorSetConstant(tensor, value));
} }
template <> template <>
void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a, void MatMul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float alpha, framework::Tensor *matrix_out,
float beta, bool relu, float *bias) { float beta, bool relu, float *bias) {
...@@ -50,20 +50,19 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -50,20 +50,19 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix"); "The input and output of MatMul be matrix");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemm gemm; Gemm gemm;
if (trans_a) { if (trans_a) {
framework::Tensor matrix_trans;
int numel = matrix_a.numel(); int numel = matrix_a.numel();
int m = matrix_a.dims()[0]; int m = matrix_a.dims()[0];
int n = matrix_a.dims()[1]; int n = matrix_a.dims()[1];
float *tmp = (float *)(matrix_a.data<float>()); // NOLINT float *tmp = (float *)(matrix_a.data<float>()); // NOLINT
float *a = static_cast<float *>( float *a = matrix_trans.mutable_data<float>(matrix_a.dims());
paddle_mobile::memory::Alloc(sizeof(float) * numel));
int index = 0; int index = 0;
for (int j = 0; j < n; j++) { for (int j = 0; j < n; j++) {
for (int i = 0; i < m; i++) { for (int i = 0; i < m; i++) {
...@@ -72,7 +71,6 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -72,7 +71,6 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
} }
#ifdef _OPENMP #ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta, gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias); matrix_out->data<float>(), N, relu, bias);
#else #else
...@@ -92,19 +90,18 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -92,19 +90,18 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
} }
} }
template <> void MatMulWithBn(const framework::Tensor &matrix_a, bool trans_a,
void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, float alpha,
const framework::Tensor &matrix_b, bool trans_b, framework::Tensor *matrix_out, float beta, bool relu,
float alpha, framework::Tensor *matrix_out, float beta, framework::Tensor *new_scale, framework::Tensor *new_bias,
bool relu, framework::Tensor *new_scale, int group, float *bias) {
framework::Tensor *new_bias, int group, float *bias) {
Gemm gemm; Gemm gemm;
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix"); "The input and output of MatMul be matrix");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
...@@ -122,7 +119,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -122,7 +119,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
new_bias->data<float>() + group, bias); new_bias->data<float>() + group, bias);
#endif #endif
} }
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void MatMulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float *p, std::string mode, framework::Tensor *matrix_out, float *p, std::string mode,
float *bias, float *bias1) { float *bias, float *bias1) {
...@@ -132,7 +129,7 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, ...@@ -132,7 +129,7 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix"); "The input and output of MatMul be matrix");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
...@@ -146,7 +143,6 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, ...@@ -146,7 +143,6 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K, gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, matrix_out->data<float>(), N, matrix_b.data<float>(), N, matrix_out->data<float>(), N,
p, mode, bias, bias1); p, mode, bias, bias1);
#endif #endif
} }
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#include <cmath>
#include <string> #include <string>
#include "framework/tensor.h" #include "framework/tensor.h"
...@@ -22,37 +21,37 @@ namespace paddle_mobile { ...@@ -22,37 +21,37 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
void set_constant(framework::Tensor *tensor, float value); void SetConstant(framework::Tensor *tensor, float value);
template <typename Itype, typename Otype> template <typename Itype, typename Otype>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void MatMul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu = false, framework::Tensor *matrix_out, float beta, bool relu = false,
Otype *bias = nullptr); Otype *bias = nullptr);
template <typename Itype, typename Otype> template <typename Itype, typename Otype>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void MatMul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu, Otype *bias, framework::Tensor *matrix_out, float beta, bool relu, Otype *bias,
bool addOnRow); bool addOnRow);
template <typename T> void MatMulWithBn(const framework::Tensor &matrix_a, bool trans_a,
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu, framework::Tensor *matrix_out, float beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias, framework::Tensor *new_scale, framework::Tensor *new_bias,
int group, T *bias = nullptr); int group, float *bias = nullptr);
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void MatMulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float *p, std::string mode, framework::Tensor *matrix_out, float *p, std::string mode,
float *bias, float *bias1); float *bias, float *bias1);
template <typename DeviceType, typename T>
template <typename Device, typename T>
struct ClearTensor { struct ClearTensor {
void operator()(framework::Tensor *tensor); void operator()(framework::Tensor *tensor);
}; };
template <typename DeviceType, typename T> template <typename Device, typename T>
struct RowwiseAdd { struct RowwiseAdd {
void operator()(const framework::Tensor &input, const framework::Tensor &vec, void operator()(const framework::Tensor &input, const framework::Tensor &vec,
framework::Tensor *output); framework::Tensor *output);
......
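A minimal usage sketch of the renamed helpers declared above; the 2x3 / 3x4 shapes and constant fills are illustrative only. It computes matrix_out = alpha * matrix_a * matrix_b + beta * matrix_out.

#include <vector>
#include "operators/math/math_function.h"

void MatMulExample() {
  using paddle_mobile::framework::Tensor;
  using paddle_mobile::framework::make_ddim;
  namespace math = paddle_mobile::operators::math;
  Tensor a, b, c;
  a.mutable_data<float>(make_ddim(std::vector<int>{2, 3}));
  b.mutable_data<float>(make_ddim(std::vector<int>{3, 4}));
  c.mutable_data<float>(make_ddim(std::vector<int>{2, 4}));
  math::SetConstant(&a, 1.f);
  math::SetConstant(&b, 2.f);
  // c = 1.0 * a * b + 0.0 * c; relu and bias keep their defaults (false, nullptr)
  math::MatMul<float, float>(a, false, b, false, 1.f, &c, 0.f);
  // every element of c is now 1 * 2 * 3 = 6
}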
...@@ -22,7 +22,7 @@ namespace operators { ...@@ -22,7 +22,7 @@ namespace operators {
namespace math { namespace math {
template <> template <>
void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a, void MatMul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float alpha, framework::Tensor *matrix_out,
float beta, bool relu, int32_t *bias, float beta, bool relu, int32_t *bias,
...@@ -32,7 +32,7 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -32,7 +32,7 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix"); "The input and output of MatMul be matrix");
int32_t M = dim_out[0]; int32_t M = dim_out[0];
int32_t N = dim_out[1]; int32_t N = dim_out[1];
...@@ -96,11 +96,11 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -96,11 +96,11 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
} }
template <> template <>
void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a, void MatMul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float alpha, framework::Tensor *matrix_out,
float beta, bool relu, int32_t *bias) { float beta, bool relu, int32_t *bias) {
matmul<int8_t, int32_t>(matrix_a, trans_a, matrix_b, trans_b, alpha, MatMul<int8_t, int32_t>(matrix_a, trans_a, matrix_b, trans_b, alpha,
matrix_out, beta, relu, bias, false); matrix_out, beta, relu, bias, false);
} }
......
...@@ -69,10 +69,10 @@ class LoDTensor2BatchFunctor { ...@@ -69,10 +69,10 @@ class LoDTensor2BatchFunctor {
auto lods = lod_tensor.lod(); auto lods = lod_tensor.lod();
PADDLE_MOBILE_ENFORCE((lods.size() == 1UL), PADDLE_MOBILE_ENFORCE((lods.size() == 1UL),
"Only support one level sequence now."); "Only support 1 level sequence, but %d is given",
lods.size());
const auto& lod = lods[0]; const auto& lod = lods[0];
std::vector<SeqInfo> seq_info; std::vector<SeqInfo> seq_info;
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
int length = lod[seq_id + 1] - lod[seq_id]; int length = lod[seq_id + 1] - lod[seq_id];
......
...@@ -15,154 +15,131 @@ limitations under the License. */ ...@@ -15,154 +15,131 @@ limitations under the License. */
#ifdef SOFTMAX_OP #ifdef SOFTMAX_OP
#include "operators/math/softmax.h" #include "operators/math/softmax.h"
#include "common/types.h"
#ifdef __ARM_NEON
#include <math.h> #include <math.h>
#include <algorithm> #include <algorithm>
#include <limits>
#include "common/types.h"
#include "operators/math/math_func_neon.h" #include "operators/math/math_func_neon.h"
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
using framework::DDim;
using framework::Tensor; #if defined(__ARM_NEON) || defined(__ARM_NEON__)
template <typename T> #ifndef __aarch64__
class SoftmaxFuntor<CPU, T> { inline float32_t vmaxvq_f32(const float32x4_t &r) {
#ifdef __ARM_NEON float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
void sum(float *input, float *sumptr, int inner_size, int outter_size) { return vget_lane_f32(vpmax_f32(v, v), 0);
float32x4_t acc = vdupq_n_f32(0); }
float sum_ = 0;
for (int i = 0; i < outter_size; ++i) { inline float32_t vaddvq_f32(const float32x4_t &r) {
float *input_outer_ptr = input + i * inner_size; float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r));
int nn = inner_size >> 2; return vget_lane_f32(vpadd_f32(v, v), 0);
int left = inner_size - (nn << 2); }
for (; nn > 0; nn--) { #endif // __aarch64__
float32x4_t vec_input = vld1q_f32(input_outer_ptr); #endif // __ARM_NEON__
acc = vaddq_f32(acc, vec_input);
input_outer_ptr += 4; float find_max(const float *input, const int num_classes) {
} int remain = num_classes;
float32x2_t vsum_ = vadd_f32(vget_high_f32(acc), vget_low_f32(acc)); float max = -std::numeric_limits<float>::max();
sum_ = vget_lane_f32(vsum_, 0) + vget_lane_f32(vsum_, 1); #if defined(__ARM_NEON) || defined(__ARM_NEON__)
for (; left > 0; left--) { int loop = num_classes >> 3;
sum_ += *input_outer_ptr; remain = num_classes & 0x7;
input_outer_ptr++; float32x4_t __max = vdupq_n_f32(max);
} for (int i = 0; i < loop; ++i, input += 8) {
} float32x4_t x0 = vld1q_f32(input);
for (int j = 0; j < inner_size * outter_size; ++j) { float32x4_t x1 = vld1q_f32(input + 4);
sumptr[j] = sum_; __max = vmaxq_f32(x0, __max);
} __max = vmaxq_f32(x1, __max);
}
max = vmaxvq_f32(__max);
#endif
for (int i = 0; i < remain; ++i) {
max = std::max(max, input[i]);
} }
return max;
}
void SoftmaxCacl(const Tensor *X, Tensor *Y) { template <>
const float *input = X->data<float>(); void SoftmaxFuntor<CPU, float>::operator()(const framework::Tensor *X,
const DDim &dDim = X->dims(); framework::Tensor *Y) {
int axis_index = 1; const framework::DDim &dims = X->dims();
if (dDim.size() < 4) { int batch_size = dims[0];
axis_index = 0; int num_classes = dims[dims.size() - 1];
} int channels = X->numel() / batch_size / num_classes;
DDim outer_ddim = const float *x = X->data<float>();
paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1); float *y = Y->mutable_data<float>();
DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size()); #pragma omp parallel for collapse(2)
int out_size = paddle_mobile::framework::product(outer_ddim); for (int batch = 0; batch < X->dims()[0]; ++batch) {
int inner_size = paddle_mobile::framework::product(inner_ddim); for (int channel = 0; channel < channels; ++channel) {
auto *max_ptr = new float[inner_size * out_size]; size_t offset = (batch * channels + channel) * num_classes;
// max const float *input = x + offset;
for (int j = 0; j < out_size; ++j) { float *output = y + offset;
const float *input_outer_ptr = input + j * inner_size; // find max
float *max_outer_ptr = max_ptr + j * inner_size; float max = find_max(input, num_classes);
float max_ = 0;
for (int i = 0; i < inner_size; ++i) { // exp(x - max)
const float *input_inner_ptr = input_outer_ptr + i; int remain = num_classes;
max_ = std::max(max_, input_inner_ptr[0]); #if defined(__ARM_NEON) || defined(__ARM_NEON__)
} int loop = num_classes >> 3;
for (int k = 0; k < inner_size; ++k) { remain = num_classes & 0x7;
max_outer_ptr[k] = max_; float32x4_t __max = vdupq_n_f32(max);
for (int i = 0; i < loop; ++i, input += 8, output += 8) {
float32x4_t x0 = vld1q_f32(input);
float32x4_t x1 = vld1q_f32(input + 4);
x0 = vsubq_f32(x0, __max);
x1 = vsubq_f32(x1, __max);
x0 = exp_ps(x0);
x1 = exp_ps(x1);
vst1q_f32(output, x0);
vst1q_f32(output + 4, x1);
} }
} #endif // __ARM_NEON__
// exp(value - max) for (int i = 0; i < remain; ++i) {
float *exp_sub_max = new float[inner_size * out_size]; output[i] = expf(input[i] - max);
float *exp_sub_max_ptr = &exp_sub_max[0];
for (int l = 0; l < out_size; ++l) {
const float *input_outer_ptr = input + l * inner_size;
float *max_outer_ptr = max_ptr + l * inner_size;
int nn = inner_size >> 2;
int left = inner_size - (nn << 2);
for (; nn > 0; nn--) {
float32x4_t vec_input = vld1q_f32(input_outer_ptr);
float32x4_t vec_max = vld1q_f32(max_outer_ptr);
float32x4_t vec_sub = vsubq_f32(vec_input, vec_max);
float32x4_t vec_exp = exp_ps(vec_sub);
vst1q_f32(exp_sub_max_ptr, vec_exp);
input_outer_ptr += 4;
max_outer_ptr += 4;
exp_sub_max_ptr += 4;
} }
for (; left > 0; left--) {
*exp_sub_max_ptr = expf(*input_outer_ptr - *max_outer_ptr);
input_outer_ptr++; // sum(exp(x - max))
max_outer_ptr++; float sum = 0.f;
exp_sub_max_ptr++; output = y + offset;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t __sum = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i, output += 8) {
float32x4_t x0 = vld1q_f32(output);
float32x4_t x1 = vld1q_f32(output + 4);
__sum = vaddq_f32(x0, __sum);
__sum = vaddq_f32(x1, __sum);
} }
} sum += vaddvq_f32(__sum);
float *sumptr = new float[inner_size * out_size]; #endif // __ARM_NEON__
// sum exp for (int i = 0; i < remain; ++i) {
sum(exp_sub_max, sumptr, inner_size, out_size); sum += output[i];
// div
auto *out_ptr = Y->mutable_data<float>();
for (int l = 0; l < out_size; ++l) {
const float *input_outer_ptr = exp_sub_max + l * inner_size;
float *output_outer_ptr = out_ptr + l * inner_size;
float *sum_outer_ptr = sumptr + l * inner_size;
int nn = inner_size >> 2;
int left = inner_size - (nn << 2);
for (; nn > 0; nn--) {
float32x4_t vec_input = vld1q_f32(input_outer_ptr);
float32x4_t vec_sum = vld1q_f32(sum_outer_ptr);
float32x4_t vec_div = div_ps(vec_input, vec_sum);
vst1q_f32(output_outer_ptr, vec_div);
input_outer_ptr += 4;
output_outer_ptr += 4;
sum_outer_ptr += 4;
} }
for (; left > 0; left--) {
*output_outer_ptr = (*input_outer_ptr) / (*sum_outer_ptr); // exp(x - max) / sum
input_outer_ptr++; float inv_sum = 1.f / sum;
output_outer_ptr++; output = y + offset;
sum_outer_ptr++; #if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t __inv_sum = vdupq_n_f32(inv_sum);
for (int i = 0; i < loop; ++i, output += 8) {
float32x4_t x0 = vld1q_f32(output);
float32x4_t x1 = vld1q_f32(output + 4);
x0 = vmulq_f32(x0, __inv_sum);
x1 = vmulq_f32(x1, __inv_sum);
vst1q_f32(output, x0);
vst1q_f32(output + 4, x1);
} }
}
}
#else
#endif // ARM_NEON
public:
void operator()(const framework::Tensor *X, framework::Tensor *Y) {
const DDim dDim = X->dims();
int dim1 = dDim[dDim.size() - 1];
int dim0 = X->numel() / dim1 / dDim[0];
framework::DDim matrix_shape = {dim0, dim1};
for (int i = 0; i < dDim[0]; ++i) {
framework::Tensor sub_X = X->Slice(i, i + 1);
framework::Tensor sub_Y = Y->Slice(i, i + 1);
sub_X.Resize(matrix_shape);
sub_Y.Resize(matrix_shape);
for (int j = 0; j < dim0; j++) {
framework::Tensor sub_x = sub_X.Slice(j, j + 1);
framework::Tensor sub_y = sub_Y.Slice(j, j + 1);
#ifdef __ARM_NEON
SoftmaxCacl(&sub_x, &sub_y);
#endif #endif
for (int i = 0; i < remain; ++i) {
output[i] *= inv_sum;
} }
} }
} }
}; }
template class SoftmaxFuntor<CPU, float>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
#endif // SOFTMAX_OP
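For reference, a scalar version of what the vectorized kernel above computes for each (batch, channel) row; the names here are illustrative, not part of the library.

#include <algorithm>
#include <cmath>

// y[i] = exp(x[i] - max(x)) / sum_j exp(x[j] - max(x)), the same passes
// (max, exponentiate, normalize) the NEON path performs eight lanes at a time.
static void softmax_row(const float *x, float *y, int num_classes) {
  float max = *std::max_element(x, x + num_classes);
  float sum = 0.f;
  for (int i = 0; i < num_classes; ++i) {
    y[i] = std::exp(x[i] - max);
    sum += y[i];
  }
  const float inv_sum = 1.f / sum;
  for (int i = 0; i < num_classes; ++i) {
    y[i] *= inv_sum;
  }
}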
...@@ -13,17 +13,21 @@ See the License for the specific language governing permissions and ...@@ -13,17 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef SOFTMAX_OP #ifdef SOFTMAX_OP
#pragma once #pragma once
#include "framework/tensor.h" #include "framework/tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename DeviceType, typename T> template <typename Device, typename T>
class SoftmaxFuntor { class SoftmaxFuntor {
public: public:
void operator()(const framework::Tensor *X, framework::Tensor *Y); void operator()(const framework::Tensor *X, framework::Tensor *Y);
}; };
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -327,8 +327,8 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input, ...@@ -327,8 +327,8 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
int channel = input.dims()[1]; int channel = input.dims()[1];
int height = input.dims()[2]; int height = input.dims()[2];
int width = input.dims()[3]; int width = input.dims()[3];
int h_tiles = (height + 3) / 6; // (height - 8 + 5 + 6) / 6 int h_tiles = (height + 3) / 6; // (height - 2 + 5) / 6
int w_tiles = (width + 3) / 6; // (width - 8 + 5 + 6) / 6 int w_tiles = (width + 3) / 6; // (width - 2 + 5) / 6
int tiles = (h_tiles * w_tiles + 7) / 8; int tiles = (h_tiles * w_tiles + 7) / 8;
framework::DDim transformed_shape = framework::DDim transformed_shape =
framework::make_ddim(std::vector<int>{tiles, 64, channel, 8}); framework::make_ddim(std::vector<int>{tiles, 64, channel, 8});
...@@ -336,16 +336,10 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input, ...@@ -336,16 +336,10 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
memset(outptr, 0, output->numel() * sizeof(float)); memset(outptr, 0, output->numel() * sizeof(float));
const float *inptr = input.data<float>(); const float *inptr = input.data<float>();
int inter_h = (height - 2) / 6; height = h_tiles * 6 + 2;
int inter_w = (width - 2) / 6; width = w_tiles * 6 + 2;
int remain_h = height - (inter_h * 6);
int remain_w = width - (inter_w * 6);
framework::Tensor input_pad; framework::Tensor input_pad;
if (remain_h > 2 || remain_w > 2) { if (height > input.dims()[2] || width > input.dims()[3]) {
inter_h += (remain_h > 2);
inter_w += (remain_w > 2);
height = (inter_h - 1) * 6 + 8;
width = (inter_w - 1) * 6 + 8;
framework::DDim input_shape = framework::DDim input_shape =
framework::make_ddim(std::vector<int>{1, channel, height, width}); framework::make_ddim(std::vector<int>{1, channel, height, width});
PadFunctor<CPU, float> pad; PadFunctor<CPU, float> pad;
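A quick check of the new tile arithmetic, assuming the 8x8 input tile / 6x6 output tile scheme used here: each tile reads 8 rows and advances by 6, so h_tiles = (height + 3) / 6 (i.e. ceil((height - 2) / 6)) and the input is padded up to h_tiles * 6 + 2 rows when needed.

#include <cstdio>

int main() {
  int height = 15;                 // example input height (illustrative)
  int h_tiles = (height + 3) / 6;  // 3 tiles: ceil((15 - 2) / 6)
  int padded = h_tiles * 6 + 2;    // 20 rows, so every 8x8 tile is fully covered
  std::printf("h_tiles=%d padded_height=%d\n", h_tiles, padded);
  return 0;
}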
...@@ -878,8 +872,8 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -878,8 +872,8 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
framework::Tensor *output) { framework::Tensor *output) {
// weight shape is [out_channel/4, 64, in_channel, 4], // weight shape is [out_channel/4, 64, in_channel, 4],
// input shape is [hw/8, 64, in_channel, 8] // input shape is [hw/8, 64, in_channel, 8]
int in_channel = input.dims()[2];
int tiles = input.dims()[0]; int tiles = input.dims()[0];
int in_channel = input.dims()[2];
int out_channel = weight.dims()[0]; int out_channel = weight.dims()[0];
// compute U*V first // compute U*V first
...@@ -887,7 +881,6 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -887,7 +881,6 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
framework::DDim shape = framework::DDim shape =
framework::make_ddim(std::vector<int>{out_channel, tiles, 64, 32}); framework::make_ddim(std::vector<int>{out_channel, tiles, 64, 32});
float *uv_trans_ptr = uv_trans.mutable_data<float>(shape); float *uv_trans_ptr = uv_trans.mutable_data<float>(shape);
memset(uv_trans_ptr, 0, uv_trans.numel() * sizeof(float));
const float *input_ptr = input.data<float>(); const float *input_ptr = input.data<float>();
const float *weight_ptr = weight.data<float>(); const float *weight_ptr = weight.data<float>();
...@@ -910,7 +903,8 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -910,7 +903,8 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"veor q14, q14, q14 \n" "veor q14, q14, q14 \n"
"veor q15, q15, q15 \n" "veor q15, q15, q15 \n"
"b store_res_%= \n" "cmp %[inter_channel], #0 \n"
"ble loop_1c_%= \n"
// loop 2 channels // loop 2 channels
"loop_2c_%=: \n" "loop_2c_%=: \n"
"vld1.32 {d0-d3}, [%[w_ptr]]! \n" "vld1.32 {d0-d3}, [%[w_ptr]]! \n"
...@@ -936,13 +930,14 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -936,13 +930,14 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"subs %[inter_channel], #1 \n" "subs %[inter_channel], #1 \n"
"bne loop_2c_%= \n" "bne loop_2c_%= \n"
"mov pc, lr \n"
// loop 1 channel // loop 1 channel
"loop_c_%=: \n" "loop_1c_%=: \n"
"cmp %[remain_channel], #0 \n"
"ble store_res_%= \n"
"vld1.32 {d0-d1}, [%[w_ptr]]! \n" "vld1.32 {d0-d1}, [%[w_ptr]]! \n"
"vld1.32 {d4-d7}, [%[in_ptr]]! \n" "vld1.32 {d4-d7}, [%[in_ptr]]! \n"
"vmla.f32 q8, q2, d0[0] \n" "vmla.f32 q8, q2, d0[0] \n"
"vmla.f32 q9, q3, d0[0] \n" "vmla.f32 q9, q3, d0[0] \n"
"vmla.f32 q10, q2, d0[1] \n" "vmla.f32 q10, q2, d0[1] \n"
...@@ -952,28 +947,16 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -952,28 +947,16 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"vmla.f32 q14, q2, d1[1] \n" "vmla.f32 q14, q2, d1[1] \n"
"vmla.f32 q15, q3, d1[1] \n" "vmla.f32 q15, q3, d1[1] \n"
"subs %[remain_channel], #1 \n"
"bne loop_c_%= \n"
"mov pc, lr \n"
"store_res_%=: \n" "store_res_%=: \n"
"cmp %[inter_channel], #0 \n"
"it gt \n"
"blgt loop_2c_%= \n"
"cmp %[remain_channel], #0 \n"
"it gt \n"
"blgt loop_c_%= \n"
"vst1.32 {d16-d19}, [%[uv_ptr]]! \n" "vst1.32 {d16-d19}, [%[uv_ptr]]! \n"
"vst1.32 {d20-d23}, [%[uv_ptr]]! \n" "vst1.32 {d20-d23}, [%[uv_ptr]]! \n"
"vst1.32 {d24-d27}, [%[uv_ptr]]! \n" "vst1.32 {d24-d27}, [%[uv_ptr]]! \n"
"vst1.32 {d28-d31}, [%[uv_ptr]]! \n" "vst1.32 {d28-d31}, [%[uv_ptr]]! \n"
: [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr), : [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr),
[remain_channel] "+r"(remain_channel),
[inter_channel] "+r"(inter_channel) [inter_channel] "+r"(inter_channel)
: : [remain_channel] "r"(remain_channel)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "pc", "lr"); "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
} }
} }
} }
...@@ -1223,8 +1206,10 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -1223,8 +1206,10 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
size_t offset = (oc * out_h + 6 * tile_h) * out_w + 6 * tile_w; size_t offset = (oc * out_h + 6 * tile_h) * out_w + 6 * tile_w;
float *out_ptr = output_ptr + offset; float *out_ptr = output_ptr + offset;
int remain_row = (tile_h < h_tiles - 1) ? 6 : remain_h; int remain_row = out_h - 6 * tile_h;
int remain_col = (tile_w < w_tiles - 1) ? 6 : remain_w; int remain_col = out_w - 6 * tile_w;
remain_row = (remain_row > 6) ? 6 : remain_row;
remain_col = (remain_col > 6) ? 6 : remain_col;
for (int i = 0; i < remain_row; ++i, out_ptr += out_w) { for (int i = 0; i < remain_row; ++i, out_ptr += out_w) {
memcpy(out_ptr, output_tmp + i * 6, remain_col * sizeof(float)); memcpy(out_ptr, output_tmp + i * 6, remain_col * sizeof(float));
} }
......
...@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
// Inspired by https://arxiv.org/abs/1509.09308 and referred from nnpack and ncnn // We refer to https://github.com/andravin/wincnn for the winograd transform
// project. // matrices
#ifdef CONV_OP #ifdef CONV_OP
#ifdef __aarch64__ #ifdef __aarch64__
#include "operators/math/pad.h"
#include "operators/math/winograd/winograd_transform.h" #include "operators/math/winograd/winograd_transform.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -29,46 +27,382 @@ namespace math { ...@@ -29,46 +27,382 @@ namespace math {
template <> template <>
void winograd_transform_weight<8, 3>(const framework::Tensor &weight, void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
framework::Tensor *output) { framework::Tensor *output) {
/* // weight shape is [out_channel, in_channel, kernel_h, kernel_w]
* w0 = g0 int out_channel = weight.dims()[0];
* w1 = ((g0 + g2) + g1) * (-2.0 / 9) int in_channel = weight.dims()[1];
* w2 = ((g0 + g2) - g1) * (-2.0 / 9) // reshape and alloc transformed weight
* w3 = ((g0 + 4 * g2) + 2 * g1) * (1.0 / 90) framework::DDim transformed_shape =
* w4 = ((g0 + 4 * g2) - 2 * g1) * (1.0 / 90) framework::make_ddim(std::vector<int>{out_channel, in_channel, 64});
* w5 = ((g2 + 4 * g0) + 2 * g1) * (1.0 / 180) float *outptr = output->mutable_data<float>(transformed_shape);
* w6 = ((g2 + 4 * g0) - 2 * g1) * (1.0 / 180) const float *inptr = weight.data<float>();
* w7 = g2 for (int oc = 0; oc < out_channel; ++oc) {
*/ for (int ic = 0; ic < in_channel; ++ic) {
// TODO(hjchen2) size_t offset = oc * in_channel + ic;
PADDLE_MOBILE_THROW_EXCEPTION( float *kout = outptr + offset * 64;
"Winograd for arm v8 has not been implemented."); const float *k = inptr + offset * 9;
float gw[3][8];
for (int i = 0; i < 3; ++i, k += 3) {
float g0 = k[0];
float g1 = k[1];
float g2 = k[2];
float d0 = g0 + g2;
float d1 = g0 + 4 * g2;
float d2 = g2 + 4 * g0;
float d3 = 2 * g1;
gw[i][0] = g0;
gw[i][1] = -2.f / 9 * (d0 + g1); // -2.f/9 * (g0 + g1 + g2)
gw[i][2] = -2.f / 9 * (d0 - g1); // -2.f/9 * (g0 - g1 + g2)
gw[i][3] = 1.f / 90 * (d1 + d3); // 1.f/90 * (g0 + 2 * g1 + 4 * g2)
gw[i][4] = 1.f / 90 * (d1 - d3); // 1.f/90 * (g0 - 2 * g1 + 4 * g2)
gw[i][5] = 1.f / 180 * (d2 + d3); // 1.f/180 * (4 * g0 + 2 * g1 + g2)
gw[i][6] = 1.f / 180 * (d2 - d3); // 1.f/180 * (4 * g0 - 2 * g1 + g2)
gw[i][7] = g2;
}
for (int i = 0; i < 8; ++i, kout += 8) {
float g0 = gw[0][i];
float g1 = gw[1][i];
float g2 = gw[2][i];
float d0 = g0 + g2;
float d1 = g0 + 4 * g2;
float d2 = g2 + 4 * g0;
float d3 = 2 * g1;
kout[0] = g0;
kout[1] = -2.f / 9 * (d0 + g1); // -2.f/9 * (k0 + k1 + k2)
kout[2] = -2.f / 9 * (d0 - g1); // -2.f/9 * (k0 - k1 + k2)
kout[3] = 1.f / 90 * (d1 + d3); // 1.f/90 * (k0 + 2 * k1 + 4 * k2)
kout[4] = 1.f / 90 * (d1 - d3); // 1.f/90 * (k0 - 2 * k1 + 4 * k2)
kout[5] = 1.f / 180 * (d2 + d3); // 1.f/180 * (4 * k0 + 2 * k1 + k2)
kout[6] = 1.f / 180 * (d2 - d3); // 1.f/180 * (4 * k0 - 2 * k1 + k2)
kout[7] = g2;
}
}
}
} }
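For readability, the per-kernel transform implemented above is the separable F(6x6, 3x3) weight transform U = G * g * G^T; the coefficient matrix below is read directly off the scalar code and is collected here only as documentation.

// Rows of G, one per transformed coefficient; applying them first to the
// kernel rows (gw) and then to the columns (kout) yields the 8x8 tile U.
static const float G[8][3] = {
    {1.f, 0.f, 0.f},
    {-2.f / 9, -2.f / 9, -2.f / 9},
    {-2.f / 9, 2.f / 9, -2.f / 9},
    {1.f / 90, 2.f / 90, 4.f / 90},
    {1.f / 90, -2.f / 90, 4.f / 90},
    {4.f / 180, 2.f / 180, 1.f / 180},
    {4.f / 180, -2.f / 180, 1.f / 180},
    {0.f, 0.f, 1.f},
};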
template <> template <>
void winograd_transform_input<8, 3>(const framework::Tensor &input, void winograd_transform_input<8, 3>(const framework::Tensor &input,
framework::Tensor *output) { framework::Tensor *output) {
/* // tile input to [c, roundup(h/6), roundup(w/6), 64] and do transformation
* x0 = (d0 - d6) + (d4 - d2) * 5.25 int channel = input.dims()[1];
* x1 = (d2 + d6) - 4.25 * (d4 + d3) + (d1 + d5) int height = input.dims()[2];
* x2 = (d2 + d6) - 4.25 * (d4 - d3) - (d1 + d5) int width = input.dims()[3];
* x3 = (0.25 * d2 - 1.25 * d4 + d6) + (0.5 * d1 - 2.5 * d3 + 2 * d5) int h_tiles = (height + 3) / 6; // (height + 5 - 2) / 6
* x4 = (0.25 * d2 - 1.25 * d4 + d6) - (0.5 * d1 - 2.5 * d3 + 2 * d5) int w_tiles = (width + 3) / 6; // (width + 5 - 2) / 6
* x5 = (4 * d2 - 5 * d4 + d6) + (2 * d1 - 2.5 * d3 + 0.5 * d5) framework::DDim transformed_shape =
* x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5) framework::make_ddim(std::vector<int>{channel, h_tiles, w_tiles, 64});
* x7 = (d7 - d1) + (d3 - d5) * 5.25 float *outptr = output->mutable_data<float>(transformed_shape);
*/ memset(outptr, 0, channel * h_tiles * w_tiles * 64 * sizeof(float));
// TODO(hjchen2) const float *inptr = input.data<float>();
PADDLE_MOBILE_THROW_EXCEPTION( // pack input to tiles
"Winograd for arm v8 has not been implemented."); for (int c = 0; c < channel; ++c) {
int inter_h = (height - 2) / 6;
int inter_w = (width - 2) / 6;
int remain_h = height - (inter_h * 6);
int remain_w = width - (inter_w * 6);
const float *in0 = inptr + c * height * width;
const float *in1 = in0 + width;
const float *in2 = in1 + width;
const float *in3 = in2 + width;
const float *in4 = in3 + width;
const float *in5 = in4 + width;
const float *in6 = in5 + width;
const float *in7 = in6 + width;
float *out = outptr + c * h_tiles * w_tiles * 64;
for (int h = 0; h < inter_h; ++h) {
for (int w = 0; w < inter_w; ++w) {
memcpy(out, in0, 8 * sizeof(float));
memcpy(out + 8, in1, 8 * sizeof(float));
memcpy(out + 16, in2, 8 * sizeof(float));
memcpy(out + 24, in3, 8 * sizeof(float));
memcpy(out + 32, in4, 8 * sizeof(float));
memcpy(out + 40, in5, 8 * sizeof(float));
memcpy(out + 48, in6, 8 * sizeof(float));
memcpy(out + 56, in7, 8 * sizeof(float));
in0 += 6;
in1 += 6;
in2 += 6;
in3 += 6;
in4 += 6;
in5 += 6;
in6 += 6;
in7 += 6;
out += 64;
}
// remain width
if (remain_w > 2) {
memcpy(out, in0, remain_w * sizeof(float));
memcpy(out + 8, in1, remain_w * sizeof(float));
memcpy(out + 16, in2, remain_w * sizeof(float));
memcpy(out + 24, in3, remain_w * sizeof(float));
memcpy(out + 32, in4, remain_w * sizeof(float));
memcpy(out + 40, in5, remain_w * sizeof(float));
memcpy(out + 48, in6, remain_w * sizeof(float));
memcpy(out + 56, in7, remain_w * sizeof(float));
out += 64;
}
in0 += 5 * width + remain_w;
in1 += 5 * width + remain_w;
in2 += 5 * width + remain_w;
in3 += 5 * width + remain_w;
in4 += 5 * width + remain_w;
in5 += 5 * width + remain_w;
in6 += 5 * width + remain_w;
in7 += 5 * width + remain_w;
}
// remain height
if (remain_h > 2) {
for (int w = 0; w < inter_w; ++w) {
for (int rh = 0; rh < remain_h; ++rh) {
memcpy(out + rh * 8, in0 + rh * width, 8 * sizeof(float));
}
out += 64;
in0 += 6;
}
// remain width
if (remain_w > 2) {
for (int rh = 0; rh < remain_h; ++rh) {
memcpy(out + rh * 8, in0 + rh * width, remain_w * sizeof(float));
}
}
}
}
// transform tiles, compute B_T * d(c, b) * B
for (int c = 0; c < channel; ++c) {
for (int tile = 0; tile < h_tiles * w_tiles; ++tile) {
float *out = outptr + (c * h_tiles * w_tiles + tile) * 64;
// compute B_T * d(c, b)
float bd[8][8];
for (int i = 0; i < 8; ++i) {
float d0 = out[8 * i + 0];
float d1 = out[8 * i + 1];
float d2 = out[8 * i + 2];
float d3 = out[8 * i + 3];
float d4 = out[8 * i + 4];
float d5 = out[8 * i + 5];
float d6 = out[8 * i + 6];
float d7 = out[8 * i + 7];
bd[i][0] = d0 - d6 + (d4 - d2) * 5.25;
float v1 = d2 - 4.25 * d4 + d6;
float v2 = d1 - 4.25 * d3 + d5;
// d1 + d2 - 4.25 * d3 - 4.25 * d4 + d5 + d6
bd[i][1] = v1 + v2;
// -d1 + d2 + 4.25 * d3 - 4.25 * d4 - d5 + d6
bd[i][2] = v1 - v2;
v1 = 0.25 * d2 - 1.25 * d4 + d6;
v2 = 0.5 * d1 - 2.5 * d3 + 2 * d5;
// 0.5 * d1 + 0.25 * d2 - 2.5 * d3 - 1.25 * d4 + 2 * d5 + d6
bd[i][3] = v1 + v2;
// -0.5 * d1 + 0.25 * d2 + 2.5 * d3 - 1.25 * d4 - 2 * d5 + d6
bd[i][4] = v1 - v2;
v1 = 4 * d2 - 5 * d4 + d6;
v2 = 2 * d1 - 2.5 * d3 + 0.5 * d5;
// 2 * d1 + 4 * d2 - 2.5 * d3 - 5 * d4 + 0.5 * d5 + d6
bd[i][5] = v1 + v2;
// -2 * d1 + 4 * d2 + 2.5 * d3 - 5 * d4 - 0.5 * d5 + d6
bd[i][6] = v1 - v2;
bd[i][7] = d7 - d1 + (d3 - d5) * 5.25;
}
// compute B_T * d(c, b) * B
for (int i = 0; i < 8; ++i, out += 8) {
float d0 = bd[0][i];
float d1 = bd[1][i];
float d2 = bd[2][i];
float d3 = bd[3][i];
float d4 = bd[4][i];
float d5 = bd[5][i];
float d6 = bd[6][i];
float d7 = bd[7][i];
out[0] = d0 - d6 + (d4 - d2) * 5.25;
float v1 = d2 - 4.25 * d4 + d6;
float v2 = d1 - 4.25 * d3 + d5;
// d1 + d2 - 4.25 * d3 - 4.25 * d4 + d5 + d6
out[1] = v1 + v2;
// -d1 + d2 + 4.25 * d3 - 4.25 * d4 - d5 + d6
out[2] = v1 - v2;
v1 = 0.25 * d2 - 1.25 * d4 + d6;
v2 = 0.5 * d1 - 2.5 * d3 + 2 * d5;
// 0.5 * d1 + 0.25 * d2 - 2.5 * d3 - 1.25 * d4 + 2 * d5 + d6
out[3] = v1 + v2;
// -0.5 * d1 + 0.25 * d2 + 2.5 * d3 - 1.25 * d4 - 2 * d5 + d6
out[4] = v1 - v2;
v1 = 4 * d2 - 5 * d4 + d6;
v2 = 2 * d1 - 2.5 * d3 + 0.5 * d5;
// 2 * d1 + 4 * d2 - 2.5 * d3 - 5 * d4 + 0.5 * d5 + d6
out[5] = v1 + v2;
// -2 * d1 + 4 * d2 + 2.5 * d3 - 5 * d4 - 0.5 * d5 + d6
out[6] = v1 - v2;
out[7] = d7 - d1 + (d3 - d5) * 5.25;
}
}
}
} }
template <> template <>
void winograd_transform_output<8, 3>(const framework::Tensor &input, void winograd_transform_output<8, 3>(const framework::Tensor &input,
const framework::Tensor &weight, const framework::Tensor &weight,
framework::Tensor *output) { framework::Tensor *output) {
// TODO(hjchen2) // input shape is [in_channel, h_tiles, w_tiles, 64]
PADDLE_MOBILE_THROW_EXCEPTION( // weight shape is [out_channel, in_channel, 64]
"Winograd for arm v8 has not been implemented."); int in_channel = input.dims()[0];
int h_tiles = input.dims()[1];
int w_tiles = input.dims()[2];
int tiles = h_tiles * w_tiles;
int out_channel = weight.dims()[0];
// compute U*V first
framework::Tensor output_m;
framework::DDim shape =
framework::make_ddim(std::vector<int>{out_channel, tiles, 64});
float *output_m_ptr = output_m.mutable_data<float>(shape);
memset(output_m_ptr, 0, output_m.numel() * sizeof(float));
const float *input_ptr = input.data<float>();
const float *weight_ptr = weight.data<float>();
for (int i = 0; i < out_channel; ++i) {
for (int j = 0; j < tiles; ++j) {
const float *w_ptr = weight_ptr + i * in_channel * 64;
const float *in_ptr = input_ptr + j * 64;
float *m_ptr = output_m_ptr + (i * tiles + j) * 64;
for (int c = 0; c < in_channel; ++c) {
for (int k = 0; k < 64; ++k) {
m_ptr[k] += w_ptr[k] * in_ptr[k];
}
w_ptr += 64;
in_ptr += tiles * 64;
}
}
}
for (int oc = 0; oc < out_channel; ++oc) {
for (int tile = 0; tile < tiles; ++tile) {
float *m = output_m_ptr + (oc * tiles + tile) * 64;
// compute A_T * m
float am[6][8];
for (int i = 0; i < 8; ++i) {
float d0 = m[i * 8 + 0];
float d1 = m[i * 8 + 1];
float d2 = m[i * 8 + 2];
float d3 = m[i * 8 + 3];
float d4 = m[i * 8 + 4];
float d5 = m[i * 8 + 5];
float d6 = m[i * 8 + 6];
float d7 = m[i * 8 + 7];
float v0 = d1 + d2;
float v1 = d1 - d2;
float v2 = d3 + d4;
float v3 = d3 - d4;
float v4 = d5 + d6;
float v5 = d5 - d6;
am[0][i] = d0 + v0 + v2 + 32 * v4;
am[1][i] = v1 + 2 * v3 + 16 * v5;
am[2][i] = v0 + 4 * v2 + 8 * v4;
am[3][i] = v1 + 8 * v3 + 4 * v5;
am[4][i] = v0 + 16 * v2 + 2 * v4;
am[5][i] = v1 + 32 * v3 + v5 + d7;
}
// compute A_T * m * A
for (int i = 0; i < 6; ++i, m += 8) {
float d0 = am[i][0];
float d1 = am[i][1];
float d2 = am[i][2];
float d3 = am[i][3];
float d4 = am[i][4];
float d5 = am[i][5];
float d6 = am[i][6];
float d7 = am[i][7];
float v0 = d1 + d2;
float v1 = d1 - d2;
float v2 = d3 + d4;
float v3 = d3 - d4;
float v4 = d5 + d6;
float v5 = d5 - d6;
m[0] = d0 + v0 + v2 + 32 * v4;
m[1] = v1 + 2 * v3 + 16 * v5;
m[2] = v0 + 4 * v2 + 8 * v4;
m[3] = v1 + 8 * v3 + 4 * v5;
m[4] = v0 + 16 * v2 + 2 * v4;
m[5] = v1 + 32 * v3 + v5 + d7;
}
}
}
int out_h = output->dims()[2];
int out_w = output->dims()[3];
float *output_ptr = output->mutable_data<float>();
// copy valid region to final output
for (int oc = 0; oc < out_channel; ++oc) {
int inter_h = out_h / 6;
int inter_w = out_w / 6;
int remain_h = out_h - inter_h * 6;
int remain_w = out_w - inter_w * 6;
float *out_ptr0 = output_ptr + oc * out_h * out_w;
float *out_ptr1 = out_ptr0 + out_w;
float *out_ptr2 = out_ptr1 + out_w;
float *out_ptr3 = out_ptr2 + out_w;
float *out_ptr4 = out_ptr3 + out_w;
float *out_ptr5 = out_ptr4 + out_w;
const float *m_ptr = output_m_ptr + oc * tiles * 64;
for (int tile_h = 0; tile_h < inter_h; ++tile_h) {
for (int tile_w = 0; tile_w < inter_w; ++tile_w) {
const float *m = m_ptr + (tile_h * w_tiles + tile_w) * 64;
memcpy(out_ptr0, m, 6 * sizeof(float));
memcpy(out_ptr1, m + 8, 6 * sizeof(float));
memcpy(out_ptr2, m + 16, 6 * sizeof(float));
memcpy(out_ptr3, m + 24, 6 * sizeof(float));
memcpy(out_ptr4, m + 32, 6 * sizeof(float));
memcpy(out_ptr5, m + 40, 6 * sizeof(float));
out_ptr0 += 6;
out_ptr1 += 6;
out_ptr2 += 6;
out_ptr3 += 6;
out_ptr4 += 6;
out_ptr5 += 6;
}
// remain w
if (remain_w > 0) {
const float *m = m_ptr + (tile_h * w_tiles + inter_w) * 64;
memcpy(out_ptr0, m, remain_w * sizeof(float));
memcpy(out_ptr1, m + 8, remain_w * sizeof(float));
memcpy(out_ptr2, m + 16, remain_w * sizeof(float));
memcpy(out_ptr3, m + 24, remain_w * sizeof(float));
memcpy(out_ptr4, m + 32, remain_w * sizeof(float));
memcpy(out_ptr5, m + 40, remain_w * sizeof(float));
out_ptr0 += remain_w;
out_ptr1 += remain_w;
out_ptr2 += remain_w;
out_ptr3 += remain_w;
out_ptr4 += remain_w;
out_ptr5 += remain_w;
}
out_ptr0 += 5 * out_w;
out_ptr1 += 5 * out_w;
out_ptr2 += 5 * out_w;
out_ptr3 += 5 * out_w;
out_ptr4 += 5 * out_w;
out_ptr5 += 5 * out_w;
}
// remain h
if (remain_h > 0) {
for (int tile_w = 0; tile_w < inter_w; ++tile_w) {
const float *m = m_ptr + (inter_h * w_tiles + tile_w) * 64;
for (int rh = 0; rh < remain_h; ++rh) {
memcpy(out_ptr0 + rh * out_w, m + rh * 8, 6 * sizeof(float));
}
out_ptr0 += 6;
}
if (remain_w > 0) {
const float *m = m_ptr + (inter_h * w_tiles + inter_w) * 64;
for (int rh = 0; rh < remain_h; ++rh) {
memcpy(out_ptr0 + rh * out_w, m + rh * 8, remain_w * sizeof(float));
}
}
}
}
} }
} // namespace math } // namespace math
......
...@@ -439,10 +439,11 @@ class ConvParam : public OpParam { ...@@ -439,10 +439,11 @@ class ConvParam : public OpParam {
#endif #endif
protected: public:
RType *input_; RType *input_;
RType *output_; RType *output_;
RType *filter_; RType *filter_;
RType *transformed_filter_;
vector<int> strides_; vector<int> strides_;
vector<int> paddings_; vector<int> paddings_;
vector<int> dilations_; vector<int> dilations_;
...@@ -455,7 +456,7 @@ class ConvParam : public OpParam { ...@@ -455,7 +456,7 @@ class ConvParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: public:
fpga::SplitConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
...@@ -2515,6 +2516,52 @@ class ShapeParam : public OpParam { ...@@ -2515,6 +2516,52 @@ class ShapeParam : public OpParam {
}; };
#endif #endif
#ifdef TOP_K_OP
template <typename Dtype>
class TopKParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
TopKParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = OpParam::GetVarValue<GType>("X", inputs, scope);
output_ = OpParam::GetVarValue<GType>("Out", outputs, scope);
indices_ = OpParam::GetVarValue<GType>("Indices", outputs, scope);
k_ = OpParam::GetAttr<int>("k", attrs);
}
public:
RType *input_;
RType *output_;
RType *indices_;
int k_;
};
#endif // TOP_K_OP
#ifdef CAST_OP
template <typename Dtype>
class CastParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
CastParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = OpParam::GetVarValue<GType>("X", inputs, scope);
output_ = OpParam::GetVarValue<GType>("Out", outputs, scope);
input_type_ = OpParam::GetAttr<int>("in_dtype", attrs);
output_type_ = OpParam::GetAttr<int>("out_dtype", attrs);
}
public:
RType *input_;
RType *output_;
int input_type_;
int output_type_;
};
#endif // CAST_OP
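The cast kernel body is not part of this hunk; as a rough sketch only (not the merged implementation), a Compute helper driven by CastParam could do an element-wise static_cast once the input/output types have been dispatched:

#include "framework/tensor.h"

// Hypothetical helper: copy in -> out converting every element. The real kernel
// would select InT/OutT from param.input_type_ / param.output_type_.
template <typename InT, typename OutT>
void CastData(const paddle_mobile::framework::Tensor *in,
              paddle_mobile::framework::Tensor *out) {
  const InT *x = in->data<InT>();
  OutT *y = out->mutable_data<OutT>(in->dims());
  for (int i = 0; i < in->numel(); ++i) {
    y[i] = static_cast<OutT>(x[i]);
  }
}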
#ifdef QUANT_OP #ifdef QUANT_OP
template <typename Dtype> template <typename Dtype>
class QuantizeParam : public OpParam { class QuantizeParam : public OpParam {
...@@ -2542,9 +2589,9 @@ class QuantizeParam : public OpParam { ...@@ -2542,9 +2589,9 @@ class QuantizeParam : public OpParam {
public: public:
// op input // op input
RType *input_; GType *input_;
// op output // op output
RType *output_; GType *output_;
RType *online_scale_; RType *online_scale_;
// quantize offline scale // quantize offline scale
RType *offline_scale_; RType *offline_scale_;
...@@ -2578,9 +2625,9 @@ class DequantizeParam : public OpParam { ...@@ -2578,9 +2625,9 @@ class DequantizeParam : public OpParam {
public: public:
// op input // op input
RType *input_; GType *input_;
// op output // op output
RType *output_; GType *output_;
RType *activation_scale_; RType *activation_scale_;
float weight_scale_; float weight_scale_;
}; };
......
...@@ -36,4 +36,4 @@ namespace ops = paddle_mobile::operators; ...@@ -36,4 +36,4 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp); REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp);
#endif #endif
#endif #endif // QUANT_OP
...@@ -43,4 +43,4 @@ class QuantizeOp : public framework::OperatorWithKernel< ...@@ -43,4 +43,4 @@ class QuantizeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // QUANT_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TOP_K_OP
#include "operators/top_k_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void TopKOp<DeviceType, T>::InferShape() const {
const int k = this->param_.k_;
auto dims = this->param_.input_->dims();
// should check k <= dims[-1] && k >= 1
dims[dims.size() - 1] = k;
this->param_.output_->Resize(dims);
this->param_.indices_->Resize(dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(top_k, ops::TopKOp);
#endif
#endif // TOP_K_OP
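A quick shape example for the InferShape above, with illustrative sizes: only the last dimension is replaced by k.

// input dims {8, 100} with attribute k = 5:
framework::DDim dims = framework::make_ddim(std::vector<int>{8, 100});
const int k = 5;
dims[dims.size() - 1] = k;
// dims is now {8, 5}; both Out and Indices are resized to this shape.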
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TOP_K_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/kernels.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class TopKOp : public framework::OperatorWithKernel<
DeviceType, TopKParam<DeviceType>,
operators::TopKKernel<DeviceType, T>> {
public:
TopKOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, TopKParam<DeviceType>,
operators::TopKKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // TOP_K_OP
...@@ -261,20 +261,17 @@ if (NOT FOUND_MATCH) ...@@ -261,20 +261,17 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp) ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp)
target_link_libraries(test-inference-api paddle-mobile) target_link_libraries(test-inference-api paddle-mobile)
# gen test log
# gen test # gen test
ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp) ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp)
target_link_libraries(test-optimize paddle-mobile) target_link_libraries(test-optimize paddle-mobile)
#gen test #gen test
ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool-op paddle-mobile) target_link_libraries(test-pool-op paddle-mobile)
#gen test #gen test
ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-softmax-op operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-softmax paddle-mobile) target_link_libraries(test-softmax-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
...@@ -375,5 +372,8 @@ if (NOT FOUND_MATCH) ...@@ -375,5 +372,8 @@ if (NOT FOUND_MATCH)
# gen test # gen test
ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h)
target_link_libraries(test-super paddle-mobile) target_link_libraries(test-super paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
# gen test
ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h)
target_link_libraries(test-ocr paddle-mobile)
endif () endif ()
...@@ -73,14 +73,14 @@ int main() { ...@@ -73,14 +73,14 @@ int main() {
// float // float
// warm-up 10 times // warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, float>( paddle_mobile::operators::math::MatMul<float, float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0), aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr); false, nullptr);
} }
auto time_start0 = time(); auto time_start0 = time();
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, float>( paddle_mobile::operators::math::MatMul<float, float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0), aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr); false, nullptr);
} }
...@@ -91,14 +91,14 @@ int main() { ...@@ -91,14 +91,14 @@ int main() {
// int8_t without bias // int8_t without bias
// warm-up 10 times // warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t, int32_t>( paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32, aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0)); static_cast<float>(0));
} }
auto time_start1 = time(); auto time_start1 = time();
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t, int32_t>( paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32, aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0)); static_cast<float>(0));
} }
...@@ -109,13 +109,13 @@ int main() { ...@@ -109,13 +109,13 @@ int main() {
// int8_t with bias, column element wise add // int8_t with bias, column element wise add
// warm-up 10 times // warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t, int32_t>( paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8, aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false); static_cast<float>(0), false, bias_data_col, false);
} }
auto time_start2 = time(); auto time_start2 = time();
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t, int32_t>( paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8, aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false); static_cast<float>(0), false, bias_data_col, false);
} }
...@@ -126,13 +126,13 @@ int main() { ...@@ -126,13 +126,13 @@ int main() {
// int8_t with bias, row element wise add // int8_t with bias, row element wise add
// warm-up 10 times // warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t, int32_t>( paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8, aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true); static_cast<float>(0), false, bias_data_row, true);
} }
auto time_start3 = time(); auto time_start3 = time();
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t, int32_t>( paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8, aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true); static_cast<float>(0), false, bias_data_row, true);
} }
...@@ -143,13 +143,13 @@ int main() { ...@@ -143,13 +143,13 @@ int main() {
// int8_t with bias&relu // int8_t with bias&relu
// warm-up 10 times // warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t, int32_t>( paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8, aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false); static_cast<float>(0), true, bias_data_col, false);
} }
auto time_start4 = time(); auto time_start4 = time();
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t, int32_t>( paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8, aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false); static_cast<float>(0), true, bias_data_col, false);
} }
......
...@@ -39,6 +39,7 @@ using paddle_mobile::framework::Tensor; ...@@ -39,6 +39,7 @@ using paddle_mobile::framework::Tensor;
using paddle_mobile::framework::Variable; using paddle_mobile::framework::Variable;
using std::string; using std::string;
using std::vector; using std::vector;
template <typename DeviceType, typename OpType> template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> { class Executor4Test : public Executor<DeviceType> {
public: public:
...@@ -48,20 +49,19 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -48,20 +49,19 @@ class Executor4Test : public Executor<DeviceType> {
this->use_optimize_ = use_optimize; this->use_optimize_ = use_optimize;
this->program_ = p; this->program_ = p;
if (this->use_optimize_) { if (this->use_optimize_) {
this->to_predict_program_ = this->program_.optimizeProgram; this->program_desc_ = this->program_.optimizeProgram;
} else { } else {
this->to_predict_program_ = this->program_.originProgram; this->program_desc_ = this->program_.originProgram;
} }
if (this->program_.originProgram == nullptr) { if (this->program_.originProgram == nullptr) {
LOG(paddle_mobile::LogLevel::kLOG_ERROR) LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "program_desc_ == nullptr";
<< "to_predict_program_ == nullptr";
} }
const std::vector<std::shared_ptr<BlockDesc>> blocks = const std::vector<std::shared_ptr<BlockDesc>> blocks =
this->to_predict_program_->Blocks(); this->program_desc_->Blocks();
for (std::shared_ptr<BlockDesc> block_desc : blocks) { for (int block_id = 0; block_id < blocks.size(); ++block_id) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops(); std::vector<std::shared_ptr<OpDesc>> ops = blocks[block_id]->Ops();
for (int i = 0; i < ops.size(); ++i) { for (int i = 0; i < ops.size(); ++i) {
auto op = ops[i]; auto op = ops[i];
if (op->Type() == op_type) { if (op->Type() == op_type) {
...@@ -73,18 +73,16 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -73,18 +73,16 @@ class Executor4Test : public Executor<DeviceType> {
paddle_mobile::framework::OpRegistry<DeviceType>::CreateOp( paddle_mobile::framework::OpRegistry<DeviceType>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), this->program_.scope); op->GetAttrMap(), this->program_.scope);
this->ops_of_block_[*block_desc.get()].push_back(op_ptr); this->ops_of_block_[block_id].push_back(op_ptr);
break; break;
} }
} }
} }
this->InitMemory(); this->InitMemory();
for (const auto &ops : this->ops_of_block_) {
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block = for (const auto &op : ops) {
this->to_predict_program_->Block(0); op->Init();
auto &ops = this->ops_of_block_[*to_predict_block.get()]; }
for (const auto &op : ops) {
op->Init();
} }
} }
...@@ -117,12 +115,10 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -117,12 +115,10 @@ class Executor4Test : public Executor<DeviceType> {
output_tensor_sptrs[i].reset(output_tensors[i]); output_tensor_sptrs[i].reset(output_tensors[i]);
} }
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block = for (auto &ops : this->ops_of_block_) {
this->to_predict_program_->Block(0); for (auto &op : ops) {
for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); op->Run();
++j) { }
auto op = this->ops_of_block_[*to_predict_block.get()][j];
op->Run();
} }
return output_tensor_sptrs; return output_tensor_sptrs;
...@@ -139,14 +135,11 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -139,14 +135,11 @@ class Executor4Test : public Executor<DeviceType> {
auto *output_tensor = con_output->GetMutable<LoDTensor>(); auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>(dDim); output_tensor->mutable_data<float>(dDim);
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block = for (auto &ops : this->ops_of_block_) {
this->to_predict_program_->Block(0); for (auto &op : ops) {
for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); op->Run();
++j) { }
auto op = this->ops_of_block_[*to_predict_block.get()][j];
op->Run();
} }
return std::make_shared<paddle_mobile::framework::Tensor>( return std::make_shared<paddle_mobile::framework::Tensor>(
paddle_mobile::framework::Tensor(*output_tensor)); paddle_mobile::framework::Tensor(*output_tensor));
} }
......
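The Executor4Test hunks above stop keying ops_of_block_ by BlockDesc and index it by block id instead, so both initialization and execution collapse into plain nested loops. A small sketch of the resulting control flow, assuming the member is now something like a vector of per-block op vectors (the exact type is not shown in this diff):

// Assumed member shape after the refactor:
//   std::vector<std::vector<std::shared_ptr<framework::OperatorBase<DeviceType>>>> ops_of_block_;
for (const auto &ops : this->ops_of_block_) {  // one entry per block, in block-id order
  for (const auto &op : ops) {
    op->Init();  // one-time setup, done right after the ops are created
  }
}
for (auto &ops : this->ops_of_block_) {
  for (auto &op : ops) {
    op->Run();   // executed on every Predict call
  }
}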
...@@ -52,15 +52,16 @@ int main(int argc, char* argv[]) { ...@@ -52,15 +52,16 @@ int main(int argc, char* argv[]) {
SetupTensor<float>(&input, in_shape, 0.f, 255.f); SetupTensor<float>(&input, in_shape, 0.f, 255.f);
// warmup // warmup
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
output = paddle_mobile.Predict(input); paddle_mobile.Predict(input);
} }
auto time3 = time(); auto time3 = time();
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
output = paddle_mobile.Predict(input); paddle_mobile.Predict(input);
} }
auto time4 = time(); auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n"; std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n";
std::ostringstream os("output tensor size: "); std::ostringstream os("output tensor size: ");
output = paddle_mobile.Fetch();
os << output->numel() << "\n" << output->data<float>()[0]; os << output->numel() << "\n" << output->data<float>()[0];
for (int i = 1; i < output->numel(); ++i) { for (int i = 1; i < output->numel(); ++i) {
os << ", " << output->data<float>()[i]; os << ", " << output->data<float>()[i];
......
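This benchmark also reflects the refactored predict API: Predict no longer returns the output tensor, and the result is pulled afterwards with Fetch. A usage sketch under that assumption, reusing the test helpers included by these tests; the model path is a placeholder:

#include "../test_helper.h"
#include "../test_include.h"

// Hypothetical model directory; the Predict()/Fetch() split follows the diff above.
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
if (paddle_mobile.Load("./model_dir", /*optimize=*/true)) {
  paddle_mobile::framework::Tensor input;
  SetupTensor<float>(&input, {1, 3, 224, 224}, 0.f, 255.f);
  paddle_mobile.Predict(input);         // runs the network, returns nothing to keep
  auto output = paddle_mobile.Fetch();  // retrieve the (default) output tensor
  std::cout << "output numel: " << output->numel() << std::endl;
}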
...@@ -36,11 +36,11 @@ int main() { ...@@ -36,11 +36,11 @@ int main() {
input_tensor.data<float>() + input_tensor.numel()); input_tensor.data<float>() + input_tensor.numel());
// warm up 10 times // warm up 10 times
for (int i = 0; i < 1; ++i) { for (int i = 0; i < 1; ++i) {
paddle_mobile.PredictLod(input_tensor); paddle_mobile.Predict(input_tensor);
} }
auto time3 = time(); auto time3 = time();
for (int i = 0; i < 1; ++i) { for (int i = 0; i < 1; ++i) {
paddle_mobile.PredictLod(input_tensor); paddle_mobile.Predict(input_tensor);
} }
auto time4 = time(); auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms" std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
if (argc < 2) { if (argc < 2) {
std::cout << "Usage: ./test_benchmark feed_shape [thread_num] [use_fuse]\n" std::cout << "Usage: ./test_benchmark feed_shape [thread_num] [use_fuse]\n"
<< "feed_shape: input tensor shape, such as 1,3,224,224.\n" << "feed_shape: input tensor shape, such as 3,224,224.\n"
<< "thread_num: optional int, threads count, default is 1.\n" << "thread_num: optional int, threads count, default is 1.\n"
<< "use_fuse: optional bool, default is 0.\n"; << "use_fuse: optional bool, default is 0.\n";
return 1; return 1;
...@@ -41,18 +41,18 @@ int main(int argc, char* argv[]) { ...@@ -41,18 +41,18 @@ int main(int argc, char* argv[]) {
#endif #endif
paddle_mobile.SetThreadNum(thread_num); paddle_mobile.SetThreadNum(thread_num);
auto time1 = time(); auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) { std::vector<float> output;
if (paddle_mobile.Load(g_googlenet, optimize, false, 1, true)) {
auto time2 = paddle_mobile::time(); auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
<< std::endl; << std::endl;
std::vector<float> input; std::vector<float> input;
std::vector<float> output;
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
if (feed_shape) { if (feed_shape) {
sscanf(feed_shape, "%d,%d,%d", &dims[1], &dims[2], &dims[3]); sscanf(feed_shape, "%d,%d,%d", &dims[1], &dims[2], &dims[3]);
} }
std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", " std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", "
<< dims[2] << ", " << dims[3] << "]\n"; << dims[2] << ", " << dims[3] << "]" << std::endl;
GetInput<float>(g_test_image_1x3x224x224, &input, dims); GetInput<float>(g_test_image_1x3x224x224, &input, dims);
// warmup // warmup
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
......
...@@ -48,8 +48,8 @@ int main() { ...@@ -48,8 +48,8 @@ int main() {
DLOG << "words lod 22: " << words.lod(); DLOG << "words lod 22: " << words.lod();
auto time3 = time(); auto time3 = time();
for (int i = 0; i < 1; ++i) { for (int i = 0; i < 1; ++i) {
auto vec_result = paddle_mobile.PredictLod(words); paddle_mobile.Predict(words);
DLOG << *vec_result; DLOG << *paddle_mobile.Fetch();
} }
auto time4 = time(); auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms" std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
...@@ -84,8 +84,8 @@ int main() { ...@@ -84,8 +84,8 @@ int main() {
DLOG << "words lod 22: " << words.lod(); DLOG << "words lod 22: " << words.lod();
auto time3 = time(); auto time3 = time();
for (int i = 0; i < 1; ++i) { for (int i = 0; i < 1; ++i) {
auto vec_result = paddle_mobile.PredictLod(words); paddle_mobile.Predict(words);
DLOG << *vec_result; DLOG << *paddle_mobile.Fetch();
} }
auto time4 = time(); auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms" std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
void load_images(const char *image_dir, const char *images_list,
std::vector<std::string> *image_names,
std::vector<std::pair<int, int>> *image_shapes) {
int height, width;
std::string filename;
std::ifstream if_list(images_list, std::ios::in);
while (!if_list.eof()) {
if_list >> height >> width >> filename;
image_shapes->push_back(std::make_pair(height, width));
image_names->push_back(filename);
}
if_list.close();
}
int main(int argc, char **argv) {
if (argc < 4) {
std::cerr << "Usage: ./test_ocr model_dir image_dir images_list."
<< std::endl;
return 1;
}
char *model_dir = argv[1];
char *image_dir = argv[2];
char *images_list = argv[3];
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(8);
auto isok = paddle_mobile.Load(std::string(model_dir) + "/model",
std::string(model_dir) + "/params", true,
false, 1, true);
DLOG << "pass init model";
std::vector<std::string> image_names;
std::vector<std::pair<int, int>> image_shapes;
load_images(image_dir, images_list, &image_names, &image_shapes);
DLOG << "pass load images";
for (int i = 0; i < image_names.size(); i++) {
std::string file_name = image_names[i];
std::vector<float> input_vec;
std::vector<int64_t> dims{1, 1, 48, 512};
dims[2] = image_shapes[i].first;
dims[3] = image_shapes[i].second;
// load input image
std::string img_path = std::string(image_dir) + "/" + file_name;
std::cerr << "img_path: " << img_path << std::endl;
std::cerr << "shape = [" << dims[0] << ", " << dims[1] << ", " << dims[2]
<< ", " << dims[3] << "]" << std::endl;
GetInput<float>(img_path, &input_vec, dims);
framework::Tensor input(input_vec, framework::make_ddim(dims));
// predict
paddle_mobile.Predict(input);
auto output_topk = paddle_mobile.Fetch("top_k_1.tmp_0");
auto output_indices = paddle_mobile.Fetch("cast_68.tmp_0");
// print result
std::cerr << file_name << std::endl;
std::cerr << output_topk->data<float>()[0];
for (int j = 1; j < output_topk->numel(); ++j) {
std::cerr << " " << output_topk->data<float>()[j];
}
std::cerr << std::endl;
std::cerr << output_indices->data<float>()[0];
for (int j = 1; j < output_indices->numel(); ++j) {
std::cerr << " " << output_indices->data<float>()[j];
}
std::cerr << std::endl;
}
return 0;
}
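load_images above reads one "height width filename" triple per line via if_list >> height >> width >> filename, guarded by while (!if_list.eof()). An eof-controlled loop can push the last record twice when the file ends with a newline, so a stream-extraction loop is the safer reading pattern. A self-contained sketch of that variant, with a hypothetical list file shown in the comment:

#include <fstream>
#include <string>
#include <utility>
#include <vector>

// images_list is assumed to look like (file names are made up):
//   48 512 word_001.jpg
//   48 320 word_002.jpg
void load_images_checked(const char *images_list,
                         std::vector<std::string> *image_names,
                         std::vector<std::pair<int, int>> *image_shapes) {
  std::ifstream if_list(images_list, std::ios::in);
  int height = 0, width = 0;
  std::string filename;
  // The extraction itself is the loop condition, so a trailing newline
  // does not produce a duplicated final record.
  while (if_list >> height >> width >> filename) {
    image_shapes->push_back(std::make_pair(height, width));
    image_names->push_back(filename);
  }
}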
...@@ -12,29 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,29 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <math.h>
#include <limits>
#include "../test_include.h" #include "../test_include.h"
#include "operators/softmax_op.h" #include "operators/softmax_op.h"
int main() { namespace paddle_mobile {
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet)); void Softmax(const framework::Tensor *X, framework::Tensor *Y) {
if (program.originProgram == nullptr) { const framework::DDim &dims = X->dims();
DLOG << "program read file"; int batch_size = dims[0];
int num_classes = dims[dims.size() - 1];
int channels = X->numel() / batch_size / num_classes;
const float *x = X->data<float>();
float *y = Y->mutable_data<float>();
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
size_t offset = (batch * channels + c) * num_classes;
const float *input = x + offset;
float *output = y + offset;
float max = -std::numeric_limits<float>::max();
for (int j = 0; j < num_classes; ++j) {
max = (input[j] > max) ? input[j] : max;
}
float sum = 0.f;
for (int j = 0; j < num_classes; ++j) {
float tmp = std::expf(input[j] - max);
sum += tmp;
output[j] = tmp;
}
for (int j = 0; j < num_classes; ++j) {
output[j] /= sum;
}
}
} }
Executor4Test<paddle_mobile::CPU, }
paddle_mobile::operators::SoftmaxOp<paddle_mobile::CPU, float>>
executor(program, "softmax"); int TestSoftmaxOp(const std::vector<int> input_shape) {
paddle_mobile::framework::Tensor input; framework::DDim dims = framework::make_ddim(input_shape);
SetupTensor<float>(&input, {1, 1000}, static_cast<float>(0), VariableNameMap inputs;
static_cast<float>(1)); VariableNameMap outputs;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 1000}); auto scope = std::make_shared<framework::Scope>();
auto output = inputs["X"] = std::vector<std::string>({"input"});
executor.Predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim); outputs["Out"] = std::vector<std::string>({"output"});
auto *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) { auto input_var = scope.get()->Var("input");
DLOG << " value of output: " << output_ptr[j]; auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<float>(input, dims, -100.0, 100.0);
auto output_var = scope.get()->Var("output");
auto output = output_var->template Get<framework::LoDTensor>();
framework::AttributeMap attrs;
auto *op = new operators::SoftmaxOp<CPU, float>("softmax", inputs, outputs,
attrs, scope);
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
float *output_cmp_data = output_cmp.mutable_data<float>(output->dims());
Softmax(input, &output_cmp);
const float *output_data = output->data<float>();
for (int i = 0; i < output->numel(); ++i) {
float gap = output_data[i] - output_cmp_data[i];
if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
<< ", output_cmp_data[" << i
<< "] = " << output_cmp_data[i];
delete op;
exit(1);
}
} }
delete op;
return 0;
}
} // namespace paddle_mobile
int main(int argc, char *argv[]) {
TestSoftmaxOp({128, 1000});
TestSoftmaxOp({128, 10, 1000});
return 0; return 0;
} }
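The reference Softmax used by the new test subtracts the per-row maximum before exponentiating, which keeps the exponentials in range without changing the result. In formula form (the standard numerically stable softmax, not anything specific to this codebase):

\[
\mathrm{softmax}(x)_j = \frac{e^{x_j - m}}{\sum_{k=1}^{C} e^{x_k - m}}, \qquad m = \max_{k} x_k
\]

The test then compares the operator output against this reference with a relative tolerance of roughly 1e-3 per element.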
...@@ -247,6 +247,8 @@ if(NOT FOUND_MATCH) ...@@ -247,6 +247,8 @@ if(NOT FOUND_MATCH)
set(SHAPE_OP ON) set(SHAPE_OP ON)
set(ELEMENTWISEMUL_OP ON) set(ELEMENTWISEMUL_OP ON)
set(SUM_OP ON) set(SUM_OP ON)
set(TOP_K_OP ON)
set(CAST_OP ON)
set(QUANT_OP ON) set(QUANT_OP ON)
set(DEQUANT_OP ON) set(DEQUANT_OP ON)
set(FUSION_DEQUANT_BN_OP ON) set(FUSION_DEQUANT_BN_OP ON)
...@@ -449,7 +451,12 @@ endif() ...@@ -449,7 +451,12 @@ endif()
if (SUM_OP) if (SUM_OP)
add_definitions(-DSUM_OP) add_definitions(-DSUM_OP)
endif() endif()
if (TOP_K_OP)
add_definitions(-DTOP_K_OP)
endif()
if (CAST_OP)
add_definitions(-DCAST_OP)
endif()
if (QUANT_OP) if (QUANT_OP)
add_definitions(-DQUANT_OP) add_definitions(-DQUANT_OP)
endif() endif()
......
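The CMake changes above switch TOP_K_OP and CAST_OP on by default and turn each flag into a -D preprocessor definition, which suggests the corresponding operator sources are compiled behind matching #ifdef guards. A sketch of such a guard, mirroring that pattern; the file name and contents here are illustrative, not copied from the repository:

// top_k_op.cpp (sketch): only compiled into the binary when -DTOP_K_OP is defined.
#ifdef TOP_K_OP

#include "operators/top_k_op.h"  // assumed header path

namespace paddle_mobile {
namespace operators {

// InferShape and kernel registration for the top_k operator would live here.

}  // namespace operators
}  // namespace paddle_mobile

#endif  // TOP_K_OP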